[llvm] [VPlan] Add VPBundleRecipe, replacing extended reduction recipes. (PR #144281)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 07:12:35 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/144281
>From 736357e6154d4a044ecf577d21d96e0898ea4a9d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 13 Jun 2025 21:07:04 +0100
Subject: [PATCH 1/7] VPBundleRecipe
This patch adds a new recipe to combine multiple recipes into a 'bundle'
recipe, which should be considered as a single entity for cost-modeling and
transforms. The recipe needs to be 'unbundled', i.e. replaced by its
individual recipes, before execution.
This subsumes VPExtendedReductionRecipe and
VPMulAccumulateReductionRecipe and should make it easier to extend to
include more types of bundled patterns, like e.g. extends folded into
loads or various arithmetic instructions, if supported by the target.
It allows avoiding re-creating the original recipes when converting to
concrete recipes, together with removing the need to record various
information. The current version of the patch still retains the original
printing matching VPExtendedReductionRecipe and VPMulAccumulateReductionRecipe,
but this specialized print could be replaced with printing the bundled recipes
directly.
Currently the unbundle implementation is a bit more complicated than
necessary, as we need to fold the extends across ops to match the
current behavior, but there's quite possibly a better place to do so.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 335 +++++++-----------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 204 +++++++----
.../Transforms/Vectorize/VPlanTransforms.cpp | 133 ++-----
llvm/lib/Transforms/Vectorize/VPlanValue.h | 5 +-
.../LoopVectorize/ARM/mve-reductions.ll | 27 +-
.../vplan-printing-reductions.ll | 12 +-
7 files changed, 303 insertions(+), 418 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5a3c4a514a5dd..256706deb0977 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -525,14 +525,13 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
static inline bool classof(const VPRecipeBase *R) {
switch (R->getVPDefID()) {
+ case VPRecipeBase::VPBundleSC:
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPEVLBasedIVPHISC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
- case VPRecipeBase::VPMulAccumulateReductionSC:
- case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
@@ -852,9 +851,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
}
static inline bool classof(const VPUser *U) {
@@ -2431,29 +2428,6 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
}
setUnderlyingValue(I);
}
-
- /// For VPExtendedReductionRecipe.
- /// Note that the debug location is from the extend.
- VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
- ArrayRef<VPValue *> Operands, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL)
- : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
- IsOrdered(IsOrdered), IsConditional(CondOp) {
- if (CondOp)
- addOperand(CondOp);
- }
-
- /// For VPMulAccumulateReductionRecipe.
- /// Note that the NUW/NSW flags and the debug location are from the Mul.
- VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
- ArrayRef<VPValue *> Operands, VPValue *CondOp,
- bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
- : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
- IsOrdered(IsOrdered), IsConditional(CondOp) {
- if (CondOp)
- addOperand(CondOp);
- }
-
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
@@ -2479,9 +2453,7 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
}
static inline bool classof(const VPUser *U) {
@@ -2620,190 +2592,6 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
}
};
-/// A recipe to represent inloop extended reduction operations, performing a
-/// reduction on a extended vector operand into a scalar value, and adding the
-/// result to a chain. This recipe is abstract and needs to be lowered to
-/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
-/// [Condition]}.
-class VPExtendedReductionRecipe : public VPReductionRecipe {
- /// Opcode of the extend for VecOp.
- Instruction::CastOps ExtOp;
-
- /// The scalar type after extending.
- Type *ResultTy;
-
- /// For cloning VPExtendedReductionRecipe.
- VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
- : VPReductionRecipe(
- VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
- {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
- ExtRed->isOrdered(), ExtRed->getDebugLoc()),
- ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
- transferFlags(*ExtRed);
- setUnderlyingValue(ExtRed->getUnderlyingValue());
- }
-
-public:
- VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
- : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
- {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
- R->isOrdered(), Ext->getDebugLoc()),
- ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
- assert((ExtOp == Instruction::CastOps::ZExt ||
- ExtOp == Instruction::CastOps::SExt) &&
- "VPExtendedReductionRecipe only supports zext and sext.");
-
- transferFlags(*Ext);
- setUnderlyingValue(R->getUnderlyingValue());
- }
-
- ~VPExtendedReductionRecipe() override = default;
-
- VPExtendedReductionRecipe *clone() override {
- return new VPExtendedReductionRecipe(this);
- }
-
- VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
-
- void execute(VPTransformState &State) override {
- llvm_unreachable("VPExtendedReductionRecipe should be transform to "
- "VPExtendedRecipe + VPReductionRecipe before execution.");
- };
-
- /// Return the cost of VPExtendedReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- /// The scalar type after extending.
- Type *getResultType() const { return ResultTy; }
-
- /// Is the extend ZExt?
- bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
-
- /// Get the opcode of the extend for VecOp.
- Instruction::CastOps getExtOpcode() const { return ExtOp; }
-};
-
-/// A recipe to represent inloop MulAccumulateReduction operations, multiplying
-/// the vector operands (which may be extended), performing a reduction.add on
-/// the result, and adding the scalar result to a chain. This recipe is abstract
-/// and needs to be lowered to concrete recipes before codegen. The operands are
-/// {ChainOp, VecOp1, VecOp2, [Condition]}.
-class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
- /// Opcode of the extend for VecOp1 and VecOp2.
- Instruction::CastOps ExtOp;
-
- /// Non-neg flag of the extend recipe.
- bool IsNonNeg = false;
-
- /// The scalar type after extending.
- Type *ResultTy = nullptr;
-
- /// For cloning VPMulAccumulateReductionRecipe.
- VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
- : VPReductionRecipe(
- VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
- {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
- MulAcc->getCondOp(), MulAcc->isOrdered(),
- WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
- MulAcc->getDebugLoc()),
- ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
- ResultTy(MulAcc->getResultType()) {
- transferFlags(*MulAcc);
- setUnderlyingValue(MulAcc->getUnderlyingValue());
- }
-
-public:
- VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
- VPWidenCastRecipe *Ext0,
- VPWidenCastRecipe *Ext1, Type *ResultTy)
- : VPReductionRecipe(
- VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
- {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
- R->getCondOp(), R->isOrdered(),
- WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
- R->getDebugLoc()),
- ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
- assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
- Instruction::Add &&
- "The reduction instruction in MulAccumulateteReductionRecipe must "
- "be Add");
- assert((ExtOp == Instruction::CastOps::ZExt ||
- ExtOp == Instruction::CastOps::SExt) &&
- "VPMulAccumulateReductionRecipe only supports zext and sext.");
- setUnderlyingValue(R->getUnderlyingValue());
- // Only set the non-negative flag if the original recipe contains.
- if (Ext0->hasNonNegFlag())
- IsNonNeg = Ext0->isNonNeg();
- }
-
- VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
- Type *ResultTy)
- : VPReductionRecipe(
- VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
- {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
- R->getCondOp(), R->isOrdered(),
- WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
- R->getDebugLoc()),
- ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) {
- assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
- Instruction::Add &&
- "The reduction instruction in MulAccumulateReductionRecipe must be "
- "Add");
- setUnderlyingValue(R->getUnderlyingValue());
- }
-
- ~VPMulAccumulateReductionRecipe() override = default;
-
- VPMulAccumulateReductionRecipe *clone() override {
- return new VPMulAccumulateReductionRecipe(this);
- }
-
- VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
-
- void execute(VPTransformState &State) override {
- llvm_unreachable("VPMulAccumulateReductionRecipe should transform to "
- "VPWidenCastRecipe + "
- "VPWidenRecipe + VPReductionRecipe before execution");
- }
-
- /// Return the cost of VPMulAccumulateReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- Type *getResultType() const { return ResultTy; }
-
- /// The first vector value to be extended and reduced.
- VPValue *getVecOp0() const { return getOperand(1); }
-
- /// The second vector value to be extended and reduced.
- VPValue *getVecOp1() const { return getOperand(2); }
-
- /// Return true if this recipe contains extended operands.
- bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
-
- /// Return the opcode of the extends for the operands.
- Instruction::CastOps getExtOpcode() const { return ExtOp; }
-
- /// Return if the operands are zero-extended.
- bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
-
- /// Return true if the operand extends have the non-negative flag.
- bool isNonNeg() const { return IsNonNeg; }
-};
-
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
@@ -2922,6 +2710,123 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
}
};
+/// A recipe to combine multiple recipes into a 'bundle' recipe, which should
+/// be considered as a single entity for cost-modeling and transforms. The
+/// recipe needs to be 'unbundled', i.e. replaced by its individual recipes,
+/// before execution.
+class VPBundleRecipe : public VPSingleDefRecipe {
+ enum class BundleTypes {
+ ExtendedReduction,
+ MulAccumulateReduction,
+ };
+
+ /// Recipes bundled together in this VPBundleRecipe.
+ SmallVector<VPSingleDefRecipe *> BundledOps;
+
+ /// Temporary VPValues used for external operands of the bundle, i.e. operands
+ /// not defined by recipes in the bundle.
+ SmallVector<VPValue *> TmpValues;
+
+ /// Type of the bundle.
+ BundleTypes BundleType;
+
+ VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle)
+ : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledOps(ToBundle),
+ BundleType(BundleType) {
+ // Bundle up the operand recipes.
+ SmallPtrSet<VPUser *, 4> BundledUsers;
+ for (auto *R : ToBundle)
+ BundledUsers.insert(R);
+
+    // Recipes in the bundle, except the last one, must only be used inside the
+    // bundle. If there are other external users, clone the recipes for the bundle.
+ for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
+ if (all_of(R->users(), [&BundledUsers](VPUser *U) {
+ return BundledUsers.contains(U);
+ })) {
+ if (R->getParent())
+ R->removeFromParent();
+ continue;
+ }
+      // There are users external to the bundle. Clone the recipe for use in
+      // the bundle and update all its in-bundle users.
+ this->BundledOps[Idx] = R->clone();
+ BundledUsers.insert(this->BundledOps[Idx]);
+ R->replaceUsesWithIf(this->BundledOps[Idx],
+ [&BundledUsers](VPUser &U, unsigned) {
+ return BundledUsers.contains(&U);
+ });
+ }
+ BundledOps.back()->removeFromParent();
+
+ // Internalize all external operands to the bundled operations. To do so,
+    // create new temporary VPValues for all operands not defined by a recipe
+    // in the bundle. The original operands are added as operands of the
+ // VPBundleRecipe.
+ for (auto *R : this->BundledOps) {
+ for (const auto &[Idx, Op] : enumerate(R->operands())) {
+ auto *Def = Op->getDefiningRecipe();
+ if (Def && BundledUsers.contains(Def))
+ continue;
+ addOperand(Op);
+ TmpValues.push_back(new VPValue());
+ R->setOperand(Idx, TmpValues.back());
+ }
+ }
+ }
+
+public:
+ VPBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+ VPBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
+ VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
+ {Ext0, Ext1, Mul, Red}) {}
+ VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenRecipe *Mul, VPWidenCastRecipe *Ext2,
+ VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
+ {Ext0, Ext1, Mul, Ext2, Red}) {}
+
+ ~VPBundleRecipe() override {
+ SmallPtrSet<VPRecipeBase *, 4> Seen;
+ for (auto *R : reverse(BundledOps))
+ if (Seen.insert(R).second)
+ delete R;
+ for (VPValue *T : TmpValues)
+ delete T;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPBundleSC)
+
+ VPBundleRecipe *clone() override {
+ return new VPBundleRecipe(BundleType, BundledOps);
+ }
+
+ /// Return the VPSingleDefRecipe producing the final result of the bundled
+ /// recipe.
+ VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+
+ void unbundle();
+
+  /// This recipe cannot generate code directly; it must be unbundled into its
+  /// constituent recipes before execution.
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("recipe must be removed before execute");
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
/// control converges back from a Branch-on-Mask. The phi nodes are needed in
/// order to merge values that are set under such a branch and feed their uses.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..c8336e7b3f92c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -267,6 +267,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
+ .Case<VPBundleRecipe>([this](const auto *R) {
+ return inferScalarType(R->getOperand(R->getNumOperands() - 2));
+ })
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
@@ -296,8 +299,6 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
- .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
- [](const auto *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 048286d7a97bc..392d6c3d32c87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -73,8 +73,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
- case VPExtendedReductionSC:
- case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -123,8 +121,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
- case VPExtendedReductionSC:
- case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -163,8 +159,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
- case VPExtendedReductionSC:
- case VPMulAccumulateReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
@@ -2582,30 +2576,142 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
Ctx.CostKind);
}
-InstructionCost
-VPExtendedReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
- Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
- assert(RedTy->isIntegerTy() &&
- "ExtendedReduction only support integer type currently.");
- return Ctx.TTI.getExtendedReductionCost(Opcode, isZExt(), RedTy, SrcVecTy,
- std::nullopt, Ctx.CostKind);
+void VPBundleRecipe::unbundle() {
+ for (auto *Op : BundledOps)
+ if (!Op->getParent())
+ Op->insertBefore(this);
+
+ for (const auto &[Idx, Op] : enumerate(operands()))
+ TmpValues[Idx]->replaceAllUsesWith(Op);
+
+ replaceAllUsesWith(getResultOp());
+
+ if (BundleType == BundleTypes::MulAccumulateReduction &&
+ BundledOps.size() == 5) {
+ // Note that we will drop the extend after mul which transforms
+ // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+ auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
+ auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
+ auto *Op0 =
+ new VPWidenCastRecipe(Ext0->getOpcode(), Ext0->getOperand(0),
+ Ext2->getResultType(), *Ext0, getDebugLoc());
+ Op0->insertBefore(Ext0);
+
+ VPSingleDefRecipe *Op1 = Op0;
+ if (Ext0 != Ext1) {
+ Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
+ Ext2->getResultType(), *Ext1, getDebugLoc());
+ Op1->insertBefore(Ext0);
+ }
+ auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
+ auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
+ Mul->setOperand(0, Op0);
+ Mul->setOperand(1, Op1);
+ Red->setOperand(1, Mul);
+ Ext0->eraseFromParent();
+ Ext2->eraseFromParent();
+ if (Ext0 != Ext1)
+ Ext1->eraseFromParent();
+ }
+ BundledOps.clear();
}
-InstructionCost
-VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
- return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy,
- Ctx.CostKind);
+ auto *SrcVecTy = cast<VectorType>(
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
+ assert(RedTy->isIntegerTy() &&
+ "ExtendedReduction only support integer type currently.");
+ switch (BundleType) {
+ case BundleTypes::ExtendedReduction: {
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(BundledOps[1])->getRecurrenceKind());
+ return Ctx.TTI.getExtendedReductionCost(
+ Opcode,
+ cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ Instruction::ZExt,
+ RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+ }
+ case BundleTypes::MulAccumulateReduction:
+ return Ctx.TTI.getMulAccReductionCost(
+ BundledOps.size() > 2
+ ? cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ Instruction::ZExt
+ : false,
+ RedTy, SrcVecTy, Ctx.CostKind);
+ }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "BUNDLE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ auto *Red = cast<VPReductionRecipe>(BundledOps.back());
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+
+ switch (BundleType) {
+ case BundleTypes::ExtendedReduction: {
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ O << " +";
+ O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ Red->printFlags(O);
+
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+ O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+ << *Ext0->getResultType();
+ if (Red->isConditional()) {
+ O << ", ";
+ Red->getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ break;
+ }
+ case BundleTypes::MulAccumulateReduction: {
+ getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+ O << " + ";
+ O << "reduce."
+ << Instruction::getOpcodeName(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+ << " (";
+ O << "mul";
+ auto *Mul = cast<VPWidenRecipe>(BundledOps.size() == 2 ? BundledOps[0]
+ : BundledOps[2]);
+ Mul->printFlags(O);
+ bool IsExtended = BundledOps.size() > 2;
+ if (IsExtended)
+ O << "(";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ if (IsExtended) {
+ auto *Ext0 = cast<VPWidenCastRecipe>(
+ BundledOps.size() == 5 ? BundledOps[3] : BundledOps[0]);
+ O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+ << *Ext0->getResultType() << "), (";
+ } else {
+ O << ", ";
+ }
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ if (IsExtended) {
+ auto *Ext1 = cast<VPWidenCastRecipe>(
+ BundledOps.size() == 5 ? BundledOps[3] : BundledOps[1]);
+ O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+ << *Ext1->getResultType() << ")";
+ }
+ if (Red->isConditional()) {
+ O << ", ";
+ Red->getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ break;
+ }
+ }
+}
+
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "REDUCE ";
@@ -2648,58 +2754,6 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
O << ")";
}
-void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "EXTENDED-REDUCE ";
- printAsOperand(O, SlotTracker);
- O << " = ";
- getChainOp()->printAsOperand(O, SlotTracker);
- O << " +";
- O << " reduce."
- << Instruction::getOpcodeName(
- RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
- << " (";
- getVecOp()->printAsOperand(O, SlotTracker);
- printFlags(O);
- O << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType();
- if (isConditional()) {
- O << ", ";
- getCondOp()->printAsOperand(O, SlotTracker);
- }
- O << ")";
-}
-
-void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "MULACC-REDUCE ";
- printAsOperand(O, SlotTracker);
- O << " = ";
- getChainOp()->printAsOperand(O, SlotTracker);
- O << " + ";
- O << "reduce."
- << Instruction::getOpcodeName(
- RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
- << " (";
- O << "mul";
- printFlags(O);
- if (isExtended())
- O << "(";
- getVecOp0()->printAsOperand(O, SlotTracker);
- if (isExtended())
- O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType()
- << "), (";
- else
- O << ", ";
- getVecOp1()->printAsOperand(O, SlotTracker);
- if (isExtended())
- O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType()
- << ")";
- if (isConditional()) {
- O << ", ";
- getCondOp()->printAsOperand(O, SlotTracker);
- }
- O << ")";
-}
#endif
/// A helper function to scalarize a single Instruction in the innermost loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 05a0e15f9a199..0b4cd10f35252 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1779,9 +1779,9 @@ void VPlanTransforms::truncateToMinimalBitwidths(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
- VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
- &R))
+ if (!isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPReplicateRecipe, VPWidenSelectRecipe, VPWidenLoadRecipe,
+ VPWidenIntrinsicRecipe>(&R))
continue;
VPValue *ResultVPV = R.getVPSingleValue();
@@ -2530,83 +2530,6 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
R->dissolveToCFGLoop();
}
-// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe.
-static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
- VPWidenCastRecipe *Ext;
- // Only ZExt contains non-neg flags.
- if (ExtRed->isZExt())
- Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
- ExtRed->getResultType(), *ExtRed,
- ExtRed->getDebugLoc());
- else
- Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
- ExtRed->getResultType(), {},
- ExtRed->getDebugLoc());
-
- auto *Red = new VPReductionRecipe(
- ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext,
- ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc());
- Ext->insertBefore(ExtRed);
- Red->insertBefore(ExtRed);
- ExtRed->replaceAllUsesWith(Red);
- ExtRed->eraseFromParent();
-}
-
-// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) +
-// VPReductionRecipe (reduce.add)
-// + VPWidenCastRecipe (optional).
-static void
-expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
- // Generate inner VPWidenCastRecipes if necessary.
- // Note that we will drop the extend after mul which transforms
- // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
- VPValue *Op0, *Op1;
- if (MulAcc->isExtended()) {
- Type *RedTy = MulAcc->getResultType();
- if (MulAcc->isZExt())
- Op0 = new VPWidenCastRecipe(
- MulAcc->getExtOpcode(), MulAcc->getVecOp0(), RedTy,
- VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), MulAcc->getDebugLoc());
- else
- Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
- RedTy, {}, MulAcc->getDebugLoc());
- Op0->getDefiningRecipe()->insertBefore(MulAcc);
- // Prevent reduce.add(mul(ext(A), ext(A))) generate duplicate
- // VPWidenCastRecipe.
- if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) {
- Op1 = Op0;
- } else {
- if (MulAcc->isZExt())
- Op1 = new VPWidenCastRecipe(
- MulAcc->getExtOpcode(), MulAcc->getVecOp1(), RedTy,
- VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()),
- MulAcc->getDebugLoc());
- else
- Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
- RedTy, {}, MulAcc->getDebugLoc());
- Op1->getDefiningRecipe()->insertBefore(MulAcc);
- }
- } else {
- // No extends in this MulAccRecipe.
- Op0 = MulAcc->getVecOp0();
- Op1 = MulAcc->getVecOp1();
- }
-
- std::array<VPValue *, 2> MulOps = {Op0, Op1};
- auto *Mul = new VPWidenRecipe(
- Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(),
- MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc());
- Mul->insertBefore(MulAcc);
-
- auto *Red = new VPReductionRecipe(
- MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul,
- MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc());
- Red->insertBefore(MulAcc);
-
- MulAcc->replaceAllUsesWith(Red);
- MulAcc->eraseFromParent();
-}
-
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
using namespace llvm::VPlanPatternMatch;
@@ -2666,12 +2589,10 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
ToRemove.push_back(VPI);
}
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) {
- expandVPExtendedReduction(ExtRed);
- continue;
+ if (auto *Bundle = dyn_cast<VPBundleRecipe>(&R)) {
+ Bundle->unbundle();
+ Bundle->eraseFromParent();
}
- if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(&R))
- expandVPMulAccumulateReduction(MulAcc);
}
}
@@ -2771,10 +2692,10 @@ void VPlanTransforms::handleUncountableEarlyExit(
}
/// This function tries convert extended in-loop reductions to
-/// VPExtendedReductionRecipe and clamp the \p Range if it is beneficial and
-/// valid. The created recipe must be lowered to concrete
+/// VPBundleRecipe and clamp the \p Range if it is beneficial and
+/// valid. The created recipe must be unbundled to its constituent
/// recipes before execution.
-static VPExtendedReductionRecipe *
+static VPBundleRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
VFRange &Range) {
using namespace VPlanPatternMatch;
@@ -2808,19 +2729,19 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
Instruction::CastOps::ZExt,
Ctx.Types.inferScalarType(A)))
- return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(VecOp));
+ return new VPBundleRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
return nullptr;
}
/// This function tries convert extended in-loop reductions to
-/// VPMulAccumulateReductionRecipe and clamp the \p Range if it is beneficial
-/// and valid. The created VPExtendedReductionRecipe must be lower to concrete
-/// recipes before execution. Patterns of MulAccumulateReduction:
+/// VPBundleRecipe and clamp the \p Range if it is beneficial
+/// and valid. The created VPBundleRecipe must be unbundled to its constituent
+/// recipes before execution. Patterns of the VPBundleRecipe:
/// reduce.add(mul(...)),
/// reduce.add(mul(ext(A), ext(B))),
/// reduce.add(ext(mul(ext(A), ext(B)))).
-static VPMulAccumulateReductionRecipe *
+static VPBundleRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
using namespace VPlanPatternMatch;
@@ -2876,12 +2797,13 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
Instruction::CastOps::ZExt,
- Mul, RecipeA, RecipeB, nullptr))
- return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB,
- RecipeA->getResultType());
+ Mul, RecipeA, RecipeB, nullptr)) {
+ return new VPBundleRecipe(RecipeA, RecipeB, Mul, Red);
+ }
// Match reduce.add(mul).
- if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
- return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy);
+ if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) {
+ return new VPBundleRecipe(Mul, Red);
+ }
}
// Match reduce.add(ext(mul(ext(A), ext(B)))).
// All extend recipes must have same opcode or A == B
@@ -2898,9 +2820,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Ext0->getOpcode() == Ext1->getOpcode() &&
IsMulAccValidAndClampRange(Ext0->getOpcode() ==
Instruction::CastOps::ZExt,
- Mul, Ext0, Ext1, Ext))
- return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1,
- Ext->getResultType());
+ Mul, Ext0, Ext1, Ext)) {
+ return new VPBundleRecipe(Ext0, Ext1, Mul, Ext, Red);
+ }
}
return nullptr;
}
@@ -2910,8 +2832,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
VPCostContext &Ctx,
VFRange &Range) {
- VPReductionRecipe *AbstractR = nullptr;
-
+ VPBundleRecipe *AbstractR = nullptr;
+ auto IP = std::next(Red->getIterator());
+ auto *VPBB = Red->getParent();
if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
AbstractR = MulAcc;
else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
@@ -2920,7 +2843,7 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
if (!AbstractR)
return;
- AbstractR->insertBefore(Red);
+ AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
}
@@ -2928,7 +2851,7 @@ void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
- for (VPRecipeBase &R : *VPBB) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..7246cb9a75ed8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -32,6 +32,7 @@ namespace llvm {
// Forward declarations.
class raw_ostream;
class Value;
+class VPBundleRecipe;
class VPDef;
struct VPDoubleValueDef;
class VPSlotTracker;
@@ -49,6 +50,7 @@ class VPValue {
friend struct VPDoubleValueDef;
friend class VPInterleaveRecipe;
friend class VPlan;
+ friend class VPBundleRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -328,6 +330,7 @@ class VPDef {
/// type identification.
using VPRecipeTy = enum {
VPBranchOnMaskSC,
+ VPBundleSC,
VPDerivedIVSC,
VPExpandSCEVSC,
VPIRInstructionSC,
@@ -335,8 +338,6 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
- VPMulAccumulateReductionSC,
- VPExtendedReductionSC,
VPPartialReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index f179a3ae04d23..212340fdcda26 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483632
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -1526,7 +1526,8 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 978f1b80d26da..3cd37851ec725 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -287,12 +287,12 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
-; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -332,7 +332,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -340,7 +340,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -382,7 +382,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -390,7 +390,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
>From 3c3f9e4ab79f1914923db6df0b2cc71c18c31e89 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Jun 2025 11:56:40 +0100
Subject: [PATCH 2/7] !fixup address comments, thanks!
---
llvm/lib/Transforms/Vectorize/VPlan.h | 24 +++++++++++--------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 ++++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++-
.../LoopVectorize/ARM/mve-reductions.ll | 24 +++++++++----------
4 files changed, 33 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 256706deb0977..00412fb70aa0a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2735,10 +2735,10 @@ class VPBundleRecipe : public VPSingleDefRecipe {
BundleType(BundleType) {
// Bundle up the operand recipes.
SmallPtrSet<VPUser *, 4> BundledUsers;
- for (auto *R : ToBundle)
+ for (auto *R : BundledOps)
BundledUsers.insert(R);
- // Recipes in the bundle, expect the last one, must only be used inside the
+ // Recipes in the bundle, except the last one, must only be used inside the
// bundle. If there other external users, clone the recipes for the bundle.
for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
if (all_of(R->users(), [&BundledUsers](VPUser *U) {
@@ -2748,14 +2748,14 @@ class VPBundleRecipe : public VPSingleDefRecipe {
R->removeFromParent();
continue;
}
- // There users external to the bundle. Clone the recipe for use in the
+ // The users external to the bundle. Clone the recipe for use in the
// bundle and update all its in-bundle users.
- this->BundledOps[Idx] = R->clone();
- BundledUsers.insert(this->BundledOps[Idx]);
- R->replaceUsesWithIf(this->BundledOps[Idx],
- [&BundledUsers](VPUser &U, unsigned) {
- return BundledUsers.contains(&U);
- });
+ VPSingleDefRecipe *Copy = R->clone();
+ BundledOps[Idx] = Copy;
+ BundledUsers.insert(Copy);
+ R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+ return BundledUsers.contains(&U);
+ });
}
BundledOps.back()->removeFromParent();
@@ -2763,7 +2763,7 @@ class VPBundleRecipe : public VPSingleDefRecipe {
// create new temporary VPValues for all operands not defined by recipe in
// the bundle. The original operands are added as operands of the
// VPBundleRecipe.
- for (auto *R : this->BundledOps) {
+ for (auto *R : BundledOps) {
for (const auto &[Idx, Op] : enumerate(R->operands())) {
auto *Def = Op->getDefiningRecipe();
if (Def && BundledUsers.contains(Def))
@@ -2802,6 +2802,7 @@ class VPBundleRecipe : public VPSingleDefRecipe {
VP_CLASSOF_IMPL(VPDef::VPBundleSC)
VPBundleRecipe *clone() override {
+ assert(!BundledOps.empty() && "empty bundles should be removed");
return new VPBundleRecipe(BundleType, BundledOps);
}
@@ -2809,6 +2810,9 @@ class VPBundleRecipe : public VPSingleDefRecipe {
/// recipe.
VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+ /// Insert the bundled recipes back into the VPlan, directly before the
+ /// current recipe. Leaves the bundle recipe empty and the recipe must be
+ /// removed before codegen.
void unbundle();
/// Generate the extraction of the appropriate bit from the block mask and the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c8336e7b3f92c..1e2961c5beb56 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -268,7 +268,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPBundleRecipe>([this](const auto *R) {
- return inferScalarType(R->getOperand(R->getNumOperands() - 2));
+ unsigned RdxOpIdxOffset =
+ cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
+ : 1;
+ return inferScalarType(
+ R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
})
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 392d6c3d32c87..c6bcb1491ee4f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2590,6 +2590,7 @@ void VPBundleRecipe::unbundle() {
BundledOps.size() == 5) {
// Note that we will drop the extend after mul which transforms
// reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+ // TODO: This transform should be done separately from bundling/unbundling.
auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
@@ -2602,7 +2603,7 @@ void VPBundleRecipe::unbundle() {
if (Ext0 != Ext1) {
Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
Ext2->getResultType(), *Ext1, getDebugLoc());
- Op1->insertBefore(Ext0);
+ Op1->insertBefore(Ext1);
}
auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index 212340fdcda26..e8af144498659 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483632
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
>From 1b7bf4b989c7e0a775bf55d85d1ce4223ce21a74 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Jun 2025 12:17:22 +0100
Subject: [PATCH 3/7] !fixup fix formatting
---
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 00412fb70aa0a..5f560530400ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2428,6 +2428,7 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
}
setUnderlyingValue(I);
}
+
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
>From ac1a2dca83311b99441d697c322ccdbc46923be2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 20 Jun 2025 08:08:07 +0100
Subject: [PATCH 4/7] !fixup deep-clone whole bundle
---
llvm/lib/Transforms/Vectorize/VPlan.h | 85 +++++++----------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 94 ++++++++++++++-----
3 files changed, 108 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 39e4edeab869b..86b055f682052 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2697,7 +2697,10 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
/// A recipe to combine multiple recipes into a 'bundle' recipe, which should be
/// considered as single entity for cost-modeling and transforms. The recipe
/// needs to be 'unbundled', i.e. replaced by its individual recipes before
-/// execute.
+/// execute. The bundled recipes are completely disconnected from the def-use graph
+/// outside the bundled recipes. Operands not defined by recipes in the bundle
+/// are added as operands of the VPBundleRecipe and the users of the result
+/// recipe must be updated to use the VPBundleRecipe.
class VPBundleRecipe : public VPSingleDefRecipe {
enum class BundleTypes {
ExtendedReduction,
@@ -2705,7 +2708,7 @@ class VPBundleRecipe : public VPSingleDefRecipe {
};
/// Recipes bundled together in this VPBundleRecipe.
- SmallVector<VPSingleDefRecipe *> BundledOps;
+ SmallVector<VPSingleDefRecipe *> BundledRecipes;
/// Temporary VPValues used for external operands of the bundle, i.e. operands
/// not defined by recipes in the bundle.
@@ -2714,69 +2717,39 @@ class VPBundleRecipe : public VPSingleDefRecipe {
/// Type of the bundle.
BundleTypes BundleType;
- VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle)
- : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledOps(ToBundle),
+ VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle,
+ ArrayRef<VPValue *> Operands)
+ : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledRecipes(ToBundle),
BundleType(BundleType) {
- // Bundle up the operand recipes.
- SmallPtrSet<VPUser *, 4> BundledUsers;
- for (auto *R : BundledOps)
- BundledUsers.insert(R);
-
- // Recipes in the bundle, except the last one, must only be used inside the
- // bundle. If there other external users, clone the recipes for the bundle.
- for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
- if (all_of(R->users(), [&BundledUsers](VPUser *U) {
- return BundledUsers.contains(U);
- })) {
- if (R->getParent())
- R->removeFromParent();
- continue;
- }
- // The users external to the bundle. Clone the recipe for use in the
- // bundle and update all its in-bundle users.
- VPSingleDefRecipe *Copy = R->clone();
- BundledOps[Idx] = Copy;
- BundledUsers.insert(Copy);
- R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
- return BundledUsers.contains(&U);
- });
- }
- BundledOps.back()->removeFromParent();
-
- // Internalize all external operands to the bundled operations. To do so,
- // create new temporary VPValues for all operands not defined by recipe in
- // the bundle. The original operands are added as operands of the
- // VPBundleRecipe.
- for (auto *R : BundledOps) {
- for (const auto &[Idx, Op] : enumerate(R->operands())) {
- auto *Def = Op->getDefiningRecipe();
- if (Def && BundledUsers.contains(Def))
- continue;
- addOperand(Op);
- TmpValues.push_back(new VPValue());
- R->setOperand(Idx, TmpValues.back());
- }
- }
+ bundle(Operands);
}
+ /// Internalize recipes in BundledRecipes. External operands (i.e. not defined
+ /// by another recipe in the bundle) are replaced by temporary VPValues and
+ /// the original operands are transferred to the VPBundleRecipe itself. Clone
+ /// recipes as needed to ensure they are only used by other recipes in the
+ /// bundle. If \p Operands is not empty, use it as operands for the new
+ /// VPBundleRecipe (used when cloning the recipe).
+ void bundle(ArrayRef<VPValue *> Operands);
+
public:
VPBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+ : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}, {}) {}
VPBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}, {}) {}
VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
VPWidenRecipe *Mul, VPReductionRecipe *Red)
: VPBundleRecipe(BundleTypes::MulAccumulateReduction,
- {Ext0, Ext1, Mul, Red}) {}
+ {Ext0, Ext1, Mul, Red}, {}) {}
VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
VPWidenRecipe *Mul, VPWidenCastRecipe *Ext2,
VPReductionRecipe *Red)
: VPBundleRecipe(BundleTypes::MulAccumulateReduction,
- {Ext0, Ext1, Mul, Ext2, Red}) {}
+ {Ext0, Ext1, Mul, Ext2, Red}, {}) {}
~VPBundleRecipe() override {
SmallPtrSet<VPRecipeBase *, 4> Seen;
- for (auto *R : reverse(BundledOps))
+ for (auto *R : reverse(BundledRecipes))
if (Seen.insert(R).second)
delete R;
for (VPValue *T : TmpValues)
@@ -2786,13 +2759,21 @@ class VPBundleRecipe : public VPSingleDefRecipe {
VP_CLASSOF_IMPL(VPDef::VPBundleSC)
VPBundleRecipe *clone() override {
- assert(!BundledOps.empty() && "empty bundles should be removed");
- return new VPBundleRecipe(BundleType, BundledOps);
+ assert(!BundledRecipes.empty() && "empty bundles should be removed");
+ SmallVector<VPSingleDefRecipe *> NewBundledRecipes;
+ for (auto *R : BundledRecipes)
+ NewBundledRecipes.push_back(R->clone());
+ for (auto *New : NewBundledRecipes) {
+ for (const auto &[Idx, Old] : enumerate(BundledRecipes)) {
+ New->replaceUsesOfWith(Old, NewBundledRecipes[Idx]);
+ }
+ }
+ return new VPBundleRecipe(BundleType, NewBundledRecipes, operands());
}
/// Return the VPSingleDefRecipe producing the final result of the bundled
/// recipe.
- VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+ VPSingleDefRecipe *getResultRecipe() const { return BundledRecipes.back(); }
/// Insert the bundled recipes back into the VPlan, directly before the
/// current recipe. Leaves the bundle recipe empty and the recipe must be
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 1e2961c5beb56..21f90d96ed9ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -269,8 +269,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPBundleRecipe>([this](const auto *R) {
unsigned RdxOpIdxOffset =
- cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
- : 1;
+ cast<VPReductionRecipe>(R->getResultRecipe())->isConditional()
+ ? 2
+ : 1;
return inferScalarType(
R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8931fcb80d51e..4d6ac4e5a1205 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2440,24 +2440,74 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
Ctx.CostKind);
}
+void VPBundleRecipe::bundle(ArrayRef<VPValue *> Operands) {
+ assert(!BundledRecipes.empty() && "Nothing to bundle?");
+
+ // Bundle up the operand recipes.
+ SmallPtrSet<VPUser *, 4> BundledUsers;
+ for (auto *R : BundledRecipes)
+ BundledUsers.insert(R);
+
+ // Recipes in the bundle, except the last one, must only be used inside the
+ // bundle. If there are other external users, clone the recipes for the bundle.
+ for (unsigned Idx = 0; Idx != BundledRecipes.size() - 1; ++Idx) {
+ VPSingleDefRecipe *R = BundledRecipes[Idx];
+ if (all_of(R->users(), [&BundledUsers](VPUser *U) {
+ return BundledUsers.contains(U);
+ })) {
+ if (R->getParent())
+ R->removeFromParent();
+ continue;
+ }
+ // There are users external to the bundle. Clone the recipe for use in the
+ // bundle and update all its in-bundle users.
+ VPSingleDefRecipe *Copy = R->clone();
+ BundledRecipes[Idx] = Copy;
+ BundledUsers.insert(Copy);
+ R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+ return BundledUsers.contains(&U);
+ });
+ }
+ if (BundledRecipes.back()->getParent())
+ BundledRecipes.back()->removeFromParent();
+
+ // Internalize all external operands to the bundled operations. To do so,
+ // create new temporary VPValues for all operands not defined by a recipe in
+ // the bundle. The original operands are added as operands of the
+ // VPBundleRecipe.
+ for (auto *R : BundledRecipes) {
+ for (const auto &[Idx, Op] : enumerate(R->operands())) {
+ auto *Def = Op->getDefiningRecipe();
+ if (Def && BundledUsers.contains(Def))
+ continue;
+ if (Operands.empty())
+ addOperand(Op);
+ else
+ addOperand(Operands[TmpValues.size()]);
+ TmpValues.push_back(new VPValue());
+ R->setOperand(Idx, TmpValues.back());
+ }
+ }
+}
+
void VPBundleRecipe::unbundle() {
- for (auto *Op : BundledOps)
- if (!Op->getParent())
- Op->insertBefore(this);
+ for (auto *R : BundledRecipes)
+ if (!R->getParent())
+ R->insertBefore(this);
for (const auto &[Idx, Op] : enumerate(operands()))
TmpValues[Idx]->replaceAllUsesWith(Op);
- replaceAllUsesWith(getResultOp());
+ replaceAllUsesWith(getResultRecipe());
if (BundleType == BundleTypes::MulAccumulateReduction &&
- BundledOps.size() == 5) {
+ BundledRecipes.size() == 5) {
// Note that we will drop the extend after mul which transforms
// reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
// TODO: This transform should be done separately from bundling/unbundling.
- auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
- auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
- auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
+ auto *Ext1 = cast<VPWidenCastRecipe>(BundledRecipes[1]);
+ auto *Ext2 = cast<VPWidenCastRecipe>(BundledRecipes[3]);
auto *Op0 =
new VPWidenCastRecipe(Ext0->getOpcode(), Ext0->getOperand(0),
Ext2->getResultType(), *Ext0, getDebugLoc());
@@ -2469,8 +2519,8 @@ void VPBundleRecipe::unbundle() {
Ext2->getResultType(), *Ext1, getDebugLoc());
Op1->insertBefore(Ext1);
}
- auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
- auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
+ auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
+ auto *Red = cast<VPReductionRecipe>(BundledRecipes[4]);
Mul->setOperand(0, Op0);
Mul->setOperand(1, Op1);
Red->setOperand(1, Mul);
@@ -2479,7 +2529,7 @@ void VPBundleRecipe::unbundle() {
if (Ext0 != Ext1)
Ext1->eraseFromParent();
}
- BundledOps.clear();
+ BundledRecipes.clear();
}
InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
@@ -2492,17 +2542,17 @@ InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
switch (BundleType) {
case BundleTypes::ExtendedReduction: {
unsigned Opcode = RecurrenceDescriptor::getOpcode(
- cast<VPReductionRecipe>(BundledOps[1])->getRecurrenceKind());
+ cast<VPReductionRecipe>(BundledRecipes[1])->getRecurrenceKind());
return Ctx.TTI.getExtendedReductionCost(
Opcode,
- cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ cast<VPWidenCastRecipe>(BundledRecipes.front())->getOpcode() ==
Instruction::ZExt,
RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
}
case BundleTypes::MulAccumulateReduction:
return Ctx.TTI.getMulAccReductionCost(
- BundledOps.size() > 2
- ? cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ BundledRecipes.size() > 2
+ ? cast<VPWidenCastRecipe>(BundledRecipes.front())->getOpcode() ==
Instruction::ZExt
: false,
RedTy, SrcVecTy, Ctx.CostKind);
@@ -2516,7 +2566,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "BUNDLE ";
printAsOperand(O, SlotTracker);
O << " = ";
- auto *Red = cast<VPReductionRecipe>(BundledOps.back());
+ auto *Red = cast<VPReductionRecipe>(BundledRecipes.back());
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
switch (BundleType) {
@@ -2527,7 +2577,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(0)->printAsOperand(O, SlotTracker);
Red->printFlags(O);
- auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType();
if (Red->isConditional()) {
@@ -2545,16 +2595,16 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (";
O << "mul";
- auto *Mul = cast<VPWidenRecipe>(BundledOps.size() == 2 ? BundledOps[0]
- : BundledOps[2]);
+ auto *Mul = cast<VPWidenRecipe>(
+ BundledRecipes.size() == 2 ? BundledRecipes[0] : BundledRecipes[2]);
Mul->printFlags(O);
- bool IsExtended = BundledOps.size() > 2;
+ bool IsExtended = BundledRecipes.size() > 2;
if (IsExtended)
O << "(";
getOperand(0)->printAsOperand(O, SlotTracker);
if (IsExtended) {
auto *Ext0 = cast<VPWidenCastRecipe>(
- BundledOps.size() == 5 ? BundledOps[3] : BundledOps[0]);
+ BundledRecipes.size() == 5 ? BundledRecipes[3] : BundledRecipes[0]);
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType() << "), (";
} else {
@@ -2563,7 +2613,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(1)->printAsOperand(O, SlotTracker);
if (IsExtended) {
auto *Ext1 = cast<VPWidenCastRecipe>(
- BundledOps.size() == 5 ? BundledOps[3] : BundledOps[1]);
+ BundledRecipes.size() == 5 ? BundledRecipes[3] : BundledRecipes[1]);
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
<< *Ext1->getResultType() << ")";
}
>From d87843c4ccd68d293ed52c084893289e81a97dcf Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 24 Jun 2025 21:40:16 +0100
Subject: [PATCH 5/7] !fixup adjust assertion message
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 44f5bd74e3b1e..263d719c281f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2598,7 +2598,7 @@ InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
auto *SrcVecTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
assert(RedTy->isIntegerTy() &&
- "ExtendedReduction only support integer type currently.");
+ "VPBundleRecipe only supports integer types currently.");
switch (BundleType) {
case BundleTypes::ExtendedReduction: {
unsigned Opcode = RecurrenceDescriptor::getOpcode(
>From 285ac3469570ab3a456ca2a956c45e2720526591 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Jun 2025 15:09:46 +0100
Subject: [PATCH 6/7] !fixup address latest comments, thanks
---
llvm/lib/Transforms/Vectorize/VPlan.h | 113 +++++++-------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 147 ++++++++----------
.../Transforms/Vectorize/VPlanTransforms.cpp | 23 ++-
.../LoopVectorize/ARM/mve-reductions.ll | 3 +-
4 files changed, 148 insertions(+), 138 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a2c6a09fabc83..7ef42b20f2e25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2707,80 +2707,83 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
};
/// A recipe to combine multiple recipes into a 'bundle' recipe, which should be
-/// considered as single entity for cost-modeling and transforms. The recipe
-/// needs to be 'unbundled', i.e. replaced by its individual recipes before
-/// execute. The bundled recipes are completely connected from the def-use graph
-/// outside the bundled recipes. Operands not defined by recipes in the bundle
-/// are added as operands of the VPBundleRecipe and the users of the result
-/// recipe must be updated to use the VPBundleRecipe.
-class VPBundleRecipe : public VPSingleDefRecipe {
- enum class BundleTypes {
- ExtendedReduction,
- MulAccumulateReduction,
- };
-
- /// Recipes bundled together in this VPBundleRecipe.
+/// considered a single entity for cost-modeling and transforms. The recipe
+/// needs to be 'unbundled', i.e. replaced by its bundled recipes before
+/// execute. The bundled recipes are completely disconnected from the def-use
+/// graph of other, non-bundled recipes. Def-use edges between pairs of bundled
+/// recipes remain intact, whereas every edge between a bundled and a
+/// non-bundled recipe is elevated to connect the non-bundled recipe with the
+/// VPExpression itself.
+class VPExpression : public VPSingleDefRecipe {
+ /// Recipes bundled together in this VPExpression.
SmallVector<VPSingleDefRecipe *> BundledRecipes;
/// Temporary VPValues used for external operands of the bundle, i.e. operands
/// not defined by recipes in the bundle.
- SmallVector<VPValue *> TmpValues;
+ SmallVector<VPValue *> BundleLiveInPlaceholders;
+
+ enum class BundleTypes {
+ /// Represents an inloop extended reduction operation, performing a
+ /// reduction on an extended vector operand into a scalar value, and adding
+ /// the result to a chain.
+ ExtendedReduction,
+ /// Represents an inloop multiply-accumulate reduction, multiplying the
+ /// extended vector operands, performing a reduction.add on the result, and
+ /// adding the scalar result to a chain.
+ ExtMulAccumulateReduction,
+ /// Represents an inloop multiply-accumulate reduction, multiplying the
+ /// vector operands, performing a reduction.add on the result, and adding
+ /// the scalar result to a chain.
+ MulAccumulateReduction,
+ };
/// Type of the bundle.
BundleTypes BundleType;
- VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle,
- ArrayRef<VPValue *> Operands)
- : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledRecipes(ToBundle),
- BundleType(BundleType) {
- bundle(Operands);
- }
-
- /// Internalize recipes in BundledRecipes External operands (i.e. not defined
- /// by another recipe in the bundle) are replaced by temporary VPValues and
- /// the original operands are transferred to the VPBundleRecipe itself. Clone
- /// recipes as needed to ensure they are only used by other recipes in the
- /// bundle. If \p Operands is not empty, use it as operands for the new
- /// VPBundleRecipe (used when cloning the recipe).
- void bundle(ArrayRef<VPValue *> Operands);
+ /// Construct a new VPExpression by internalizing recipes in \p
+ /// BundledRecipes. External operands (i.e. not defined by another recipe in
+ /// the bundle) are replaced by temporary VPValues and the original operands
+ /// are transferred to the VPExpression itself. Clone recipes as needed
+ /// (excluding last) to ensure they are only used by other recipes in the
+ /// bundle.
+ VPExpression(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle);
public:
- VPBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}, {}) {}
- VPBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}, {}) {}
- VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
- VPWidenRecipe *Mul, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
- {Ext0, Ext1, Mul, Red}, {}) {}
- VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
- VPWidenRecipe *Mul, VPWidenCastRecipe *Ext2,
- VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
- {Ext0, Ext1, Mul, Ext2, Red}, {}) {}
-
- ~VPBundleRecipe() override {
+ VPExpression(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
+ : VPExpression(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+ VPExpression(VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPExpression(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
+ VPExpression(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPExpression(BundleTypes::ExtMulAccumulateReduction,
+ {Ext0, Ext1, Mul, Red}) {}
+
+ ~VPExpression() override {
SmallPtrSet<VPRecipeBase *, 4> Seen;
for (auto *R : reverse(BundledRecipes))
if (Seen.insert(R).second)
delete R;
- for (VPValue *T : TmpValues)
+ for (VPValue *T : BundleLiveInPlaceholders)
delete T;
}
VP_CLASSOF_IMPL(VPDef::VPBundleSC)
- VPBundleRecipe *clone() override {
+ VPExpression *clone() override {
assert(!BundledRecipes.empty() && "empty bundles should be removed");
SmallVector<VPSingleDefRecipe *> NewBundledRecipes;
for (auto *R : BundledRecipes)
NewBundledRecipes.push_back(R->clone());
for (auto *New : NewBundledRecipes) {
- for (const auto &[Idx, Old] : enumerate(BundledRecipes)) {
+ for (const auto &[Idx, Old] : enumerate(BundledRecipes))
New->replaceUsesOfWith(Old, NewBundledRecipes[Idx]);
- }
+ // Update placeholder operands in the cloned recipe to use the external
+ // operands, to be internalized when the cloned bundle is constructed.
+ for (const auto &[Placeholder, OutsideOp] :
+ zip(BundleLiveInPlaceholders, operands()))
+ New->replaceUsesOfWith(Placeholder, OutsideOp);
}
- return new VPBundleRecipe(BundleType, NewBundledRecipes, operands());
+ return new VPExpression(BundleType, NewBundledRecipes);
}
/// Return the VPSingleDefRecipe producing the final result of the bundled
@@ -2788,12 +2791,11 @@ class VPBundleRecipe : public VPSingleDefRecipe {
VPSingleDefRecipe *getResultRecipe() const { return BundledRecipes.back(); }
/// Insert the bundled recipes back into the VPlan, directly before the
- /// current recipe. Leaves the bundle recipe empty and the recipe must be
- /// removed before codegen.
+ /// current recipe. Leaves the bundle recipe empty, which must be removed
+ /// before codegen.
void unbundle();
- /// Generate the extraction of the appropriate bit from the block mask and the
- /// conditional branch.
+ /// Method for generating code, must not be called as this recipe is abstract.
void execute(VPTransformState &State) override {
llvm_unreachable("recipe must be removed before execute");
}
@@ -2806,6 +2808,13 @@ class VPBundleRecipe : public VPSingleDefRecipe {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if this bundle contains recipes that may read from or write
+ /// to memory.
+ bool mayReadOrWriteMemory() const;
+
+ /// Returns true if this bundle contains recipes that may have side effects.
+ bool mayHaveSideEffects() const;
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 263d719c281f0..3c2761f566d6a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -49,6 +49,8 @@ using VectorParts = SmallVector<Value *, 2>;
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
+ case VPBundleSC:
+ return cast<VPBundleRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPInterleaveSC:
@@ -97,6 +99,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
bool VPRecipeBase::mayReadFromMemory() const {
switch (getVPDefID()) {
+ case VPBundleSC:
+ return cast<VPBundleRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenLoadEVLSC:
@@ -143,6 +147,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
bool VPRecipeBase::mayHaveSideEffects() const {
switch (getVPDefID()) {
+ case VPBundleSC:
+ return cast<VPBundleRecipe>(this)->mayHaveSideEffects();
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
@@ -2500,95 +2506,65 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
Ctx.CostKind);
}
-void VPBundleRecipe::bundle(ArrayRef<VPValue *> Operands) {
+VPBundleRecipe::VPBundleRecipe(BundleTypes BundleType,
+ ArrayRef<VPSingleDefRecipe *> ToBundle)
+ : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}),
+ BundledRecipes(
+ SetVector<VPSingleDefRecipe *>(ToBundle.begin(), ToBundle.end())
+ .takeVector()),
+ BundleType(BundleType) {
assert(!BundledRecipes.empty() && "Nothing to bundle?");
- // Bundle up the operand recipes.
- SmallPtrSet<VPUser *, 4> BundledUsers;
+ // Maintain a copy of the bundled recipes as a set of users.
+ SmallPtrSet<VPUser *, 4> BundledRecipesAsSetOfUsers;
for (auto *R : BundledRecipes)
- BundledUsers.insert(R);
-
- // Recipes in the bundle, except the last one, must only be used inside the
- // bundle. If there other external users, clone the recipes for the bundle.
- for (unsigned Idx = 0; Idx != BundledRecipes.size() - 1; ++Idx) {
- VPSingleDefRecipe *R = BundledRecipes[Idx];
- if (all_of(R->users(), [&BundledUsers](VPUser *U) {
- return BundledUsers.contains(U);
+ BundledRecipesAsSetOfUsers.insert(R);
+
+ // Recipes in the bundle, except the last one, must only be used by (other)
+ // recipes inside the bundle. If there are other users, external to the
+ // bundle, use a clone of the recipe for external users.
+ for (VPSingleDefRecipe *R : BundledRecipes) {
+ if (R != BundledRecipes.back() &&
+ any_of(R->users(), [&BundledRecipesAsSetOfUsers](VPUser *U) {
+ return !BundledRecipesAsSetOfUsers.contains(U);
})) {
- if (R->getParent())
- R->removeFromParent();
- continue;
+ // There are users outside of the bundle. Clone the recipe and use the
+ // clone for those external users.
+ VPSingleDefRecipe *CopyForExtUsers = R->clone();
+ R->replaceUsesWithIf(CopyForExtUsers,
+ [&BundledRecipesAsSetOfUsers](VPUser &U, unsigned) {
+ return !BundledRecipesAsSetOfUsers.contains(&U);
+ });
+ CopyForExtUsers->insertBefore(R);
}
- // The users external to the bundle. Clone the recipe for use in the
- // bundle and update all its in-bundle users.
- VPSingleDefRecipe *Copy = R->clone();
- BundledRecipes[Idx] = Copy;
- BundledUsers.insert(Copy);
- R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
- return BundledUsers.contains(&U);
- });
- }
- if (BundledRecipes.back()->getParent())
- BundledRecipes.back()->removeFromParent();
-
- // Internalize all external operands to the bundled operations. To do so,
- // create new temporary VPValues for all operands not defined by recipe in
+ if (R->getParent())
+ R->removeFromParent();
+ }
+
+ // Internalize all external operands to the bundled recipes. To do so,
+ // create new temporary VPValues for all operands defined by a recipe outside
// the bundle. The original operands are added as operands of the
- // VPBundleRecipe.
+ // VPBundleRecipe itself.
for (auto *R : BundledRecipes) {
for (const auto &[Idx, Op] : enumerate(R->operands())) {
auto *Def = Op->getDefiningRecipe();
- if (Def && BundledUsers.contains(Def))
+ if (Def && BundledRecipesAsSetOfUsers.contains(Def))
continue;
- if (Operands.empty())
- addOperand(Op);
- else
- addOperand(Operands[TmpValues.size()]);
- TmpValues.push_back(new VPValue());
- R->setOperand(Idx, TmpValues.back());
+ addOperand(Op);
+ BundleLiveInPlaceholders.push_back(new VPValue());
+ R->setOperand(Idx, BundleLiveInPlaceholders.back());
}
}
}
void VPBundleRecipe::unbundle() {
for (auto *R : BundledRecipes)
- if (!R->getParent())
R->insertBefore(this);
for (const auto &[Idx, Op] : enumerate(operands()))
- TmpValues[Idx]->replaceAllUsesWith(Op);
+ BundleLiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
replaceAllUsesWith(getResultRecipe());
-
- if (BundleType == BundleTypes::MulAccumulateReduction &&
- BundledRecipes.size() == 5) {
- // Note that we will drop the extend after mul which transforms
- // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
- // TODO: This transform should be done separately from bundling/unbundling.
- auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
- auto *Ext1 = cast<VPWidenCastRecipe>(BundledRecipes[1]);
- auto *Ext2 = cast<VPWidenCastRecipe>(BundledRecipes[3]);
- auto *Op0 =
- new VPWidenCastRecipe(Ext0->getOpcode(), Ext0->getOperand(0),
- Ext2->getResultType(), *Ext0, getDebugLoc());
- Op0->insertBefore(Ext0);
-
- VPSingleDefRecipe *Op1 = Op0;
- if (Ext0 != Ext1) {
- Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
- Ext2->getResultType(), *Ext1, getDebugLoc());
- Op1->insertBefore(Ext1);
- }
- auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
- auto *Red = cast<VPReductionRecipe>(BundledRecipes[4]);
- Mul->setOperand(0, Op0);
- Mul->setOperand(1, Op1);
- Red->setOperand(1, Mul);
- Ext0->eraseFromParent();
- Ext2->eraseFromParent();
- if (Ext0 != Ext1)
- Ext1->eraseFromParent();
- }
BundledRecipes.clear();
}
@@ -2610,15 +2586,27 @@ InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
}
case BundleTypes::MulAccumulateReduction:
+ return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+
+ case BundleTypes::ExtMulAccumulateReduction:
return Ctx.TTI.getMulAccReductionCost(
- BundledRecipes.size() > 2
- ? cast<VPWidenCastRecipe>(BundledRecipes.front())->getOpcode() ==
- Instruction::ZExt
- : false,
+ cast<VPWidenCastRecipe>(BundledRecipes.front())->getOpcode() ==
+ Instruction::ZExt,
RedTy, SrcVecTy, Ctx.CostKind);
}
}
+bool VPBundleRecipe::mayReadOrWriteMemory() const {
+ return any_of(BundledRecipes, [](VPSingleDefRecipe *R) {
+ return R->mayReadFromMemory() || R->mayWriteToMemory();
+ });
+}
+
+bool VPBundleRecipe::mayHaveSideEffects() const {
+ return any_of(BundledRecipes,
+ [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); });
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
@@ -2647,7 +2635,8 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
O << ")";
break;
}
- case BundleTypes::MulAccumulateReduction: {
+ case BundleTypes::MulAccumulateReduction:
+ case BundleTypes::ExtMulAccumulateReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
O << " + ";
O << "reduce."
@@ -2655,16 +2644,15 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (";
O << "mul";
- auto *Mul = cast<VPWidenRecipe>(
- BundledRecipes.size() == 2 ? BundledRecipes[0] : BundledRecipes[2]);
+ bool IsExtended = BundleType == BundleTypes::ExtMulAccumulateReduction;
+ auto *Mul =
+ cast<VPWidenRecipe>(IsExtended ? BundledRecipes[2] : BundledRecipes[0]);
Mul->printFlags(O);
- bool IsExtended = BundledRecipes.size() > 2;
if (IsExtended)
O << "(";
getOperand(0)->printAsOperand(O, SlotTracker);
if (IsExtended) {
- auto *Ext0 = cast<VPWidenCastRecipe>(
- BundledRecipes.size() == 5 ? BundledRecipes[3] : BundledRecipes[0]);
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType() << "), (";
} else {
@@ -2672,8 +2660,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
}
getOperand(1)->printAsOperand(O, SlotTracker);
if (IsExtended) {
- auto *Ext1 = cast<VPWidenCastRecipe>(
- BundledRecipes.size() == 5 ? BundledRecipes[3] : BundledRecipes[1]);
+ auto *Ext1 = cast<VPWidenCastRecipe>(BundledRecipes[1]);
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
<< *Ext1->getResultType() << ")";
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 13273fbd9d245..dbeaf3dcf8bba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1776,9 +1776,9 @@ void VPlanTransforms::truncateToMinimalBitwidths(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
- VPReplicateRecipe, VPWidenSelectRecipe, VPWidenLoadRecipe,
- VPWidenIntrinsicRecipe>(&R))
+ if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+ VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
+ &R))
continue;
VPValue *ResultVPV = R.getVPSingleValue();
@@ -2942,7 +2942,22 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
IsMulAccValidAndClampRange(Ext0->getOpcode() ==
Instruction::CastOps::ZExt,
Mul, Ext0, Ext1, Ext)) {
- return new VPBundleRecipe(Ext0, Ext1, Mul, Ext, Red);
+ auto *NewExt0 = new VPWidenCastRecipe(
+ Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
+ Ext0->getDebugLoc());
+ NewExt0->insertBefore(Ext0);
+
+ VPWidenCastRecipe *NewExt1 = NewExt0;
+ if (Ext0 != Ext1) {
+ NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
+ Ext->getResultType(), *Ext1,
+ Ext1->getDebugLoc());
+ NewExt1->insertBefore(Ext1);
+ }
+ Mul->setOperand(0, NewExt0);
+ Mul->setOperand(1, NewExt1);
+ Red->setOperand(1, Mul);
+ return new VPBundleRecipe(NewExt0, NewExt1, Mul, Red);
}
}
return nullptr;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index e8af144498659..ddd334d2982f8 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -1525,9 +1525,8 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
>From 1d717d2e1185de00676475d4fa8dc0e2e7fae5c6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Jun 2025 15:11:23 +0100
Subject: [PATCH 7/7] !fixup adjust naming
---
llvm/lib/Transforms/Vectorize/VPlan.h | 42 ++++++++++---------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 ++++++-------
.../Transforms/Vectorize/VPlanTransforms.cpp | 25 +++++------
3 files changed, 49 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 7ef42b20f2e25..c6e4214faaa95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2713,9 +2713,9 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
/// graph of other, non-bundled recipes. Def-use edges between pairs of bundled
/// recipes remain intact, whereas every edge between a bundled and a
/// non-bundled recipe is elevated to connect the non-bundled recipe with the
-/// VPExpression itself.
-class VPExpression : public VPSingleDefRecipe {
- /// Recipes bundled together in this VPExpression.
+/// VPSingleDefBundleRecipe itself.
+class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
+ /// Recipes bundled together in this VPSingleDefBundleRecipe.
SmallVector<VPSingleDefRecipe *> BundledRecipes;
/// Temporary VPValues used for external operands of the bundle, i.e. operands
@@ -2740,25 +2740,27 @@ class VPExpression : public VPSingleDefRecipe {
/// Type of the bundle.
BundleTypes BundleType;
- /// Construct a new VPExpression by internalizing recipes in \p
+ /// Construct a new VPSingleDefBundleRecipe by internalizing recipes in \p
/// BundledRecipes. External operands (i.e. not defined by another recipe in
/// the bundle) are replaced by temporary VPValues and the original operands
- /// are transferred to the VPExpression itself. Clone recipes as needed
- /// (excluding last) to ensure they are only used by other recipes in the
- /// bundle.
- VPExpression(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle);
+ /// are transferred to the VPSingleDefBundleRecipe itself. Clone recipes as
+ /// needed (excluding last) to ensure they are only used by other recipes in
+ /// the bundle.
+ VPSingleDefBundleRecipe(BundleTypes BundleType,
+ ArrayRef<VPSingleDefRecipe *> ToBundle);
public:
- VPExpression(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
- : VPExpression(BundleTypes::ExtendedReduction, {Ext, Red}) {}
- VPExpression(VPWidenRecipe *Mul, VPReductionRecipe *Red)
- : VPExpression(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
- VPExpression(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
- VPWidenRecipe *Mul, VPReductionRecipe *Red)
- : VPExpression(BundleTypes::ExtMulAccumulateReduction,
- {Ext0, Ext1, Mul, Red}) {}
-
- ~VPExpression() override {
+ VPSingleDefBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
+ : VPSingleDefBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+ VPSingleDefBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPSingleDefBundleRecipe(BundleTypes::MulAccumulateReduction,
+ {Mul, Red}) {}
+ VPSingleDefBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPSingleDefBundleRecipe(BundleTypes::ExtMulAccumulateReduction,
+ {Ext0, Ext1, Mul, Red}) {}
+
+ ~VPSingleDefBundleRecipe() override {
SmallPtrSet<VPRecipeBase *, 4> Seen;
for (auto *R : reverse(BundledRecipes))
if (Seen.insert(R).second)
@@ -2769,7 +2771,7 @@ class VPExpression : public VPSingleDefRecipe {
VP_CLASSOF_IMPL(VPDef::VPBundleSC)
- VPExpression *clone() override {
+ VPSingleDefBundleRecipe *clone() override {
assert(!BundledRecipes.empty() && "empty bundles should be removed");
SmallVector<VPSingleDefRecipe *> NewBundledRecipes;
for (auto *R : BundledRecipes)
@@ -2783,7 +2785,7 @@ class VPExpression : public VPSingleDefRecipe {
zip(BundleLiveInPlaceholders, operands()))
New->replaceUsesOfWith(Placeholder, OutsideOp);
}
- return new VPExpression(BundleType, NewBundledRecipes);
+ return new VPSingleDefBundleRecipe(BundleType, NewBundledRecipes);
}
/// Return the VPSingleDefRecipe producing the final result of the bundled
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3c2761f566d6a..4aa7f6dce41eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -50,7 +50,7 @@ using VectorParts = SmallVector<Value *, 2>;
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
case VPBundleSC:
- return cast<VPBundleRecipe>(this)->mayReadOrWriteMemory();
+ return cast<VPSingleDefBundleRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPInterleaveSC:
@@ -100,7 +100,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
bool VPRecipeBase::mayReadFromMemory() const {
switch (getVPDefID()) {
case VPBundleSC:
- return cast<VPBundleRecipe>(this)->mayReadOrWriteMemory();
+ return cast<VPSingleDefBundleRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenLoadEVLSC:
@@ -148,7 +148,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
bool VPRecipeBase::mayHaveSideEffects() const {
switch (getVPDefID()) {
case VPBundleSC:
- return cast<VPBundleRecipe>(this)->mayHaveSideEffects();
+ return cast<VPSingleDefBundleRecipe>(this)->mayHaveSideEffects();
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
@@ -2506,8 +2506,8 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
Ctx.CostKind);
}
-VPBundleRecipe::VPBundleRecipe(BundleTypes BundleType,
- ArrayRef<VPSingleDefRecipe *> ToBundle)
+VPSingleDefBundleRecipe::VPSingleDefBundleRecipe(
+ BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle)
: VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}),
BundledRecipes(
SetVector<VPSingleDefRecipe *>(ToBundle.begin(), ToBundle.end())
@@ -2544,7 +2544,7 @@ VPBundleRecipe::VPBundleRecipe(BundleTypes BundleType,
// Internalize all external operands to the bundled recipes. To do so,
// create new temporary VPValues for all operands defined by a recipe outside
// the bundle. The original operands are added as operands of the
- // VPBundleRecipe itself.
+ // VPSingleDefBundleRecipe itself.
for (auto *R : BundledRecipes) {
for (const auto &[Idx, Op] : enumerate(R->operands())) {
auto *Def = Op->getDefiningRecipe();
@@ -2557,7 +2557,7 @@ VPBundleRecipe::VPBundleRecipe(BundleTypes BundleType,
}
}
-void VPBundleRecipe::unbundle() {
+void VPSingleDefBundleRecipe::unbundle() {
for (auto *R : BundledRecipes)
R->insertBefore(this);
@@ -2568,13 +2568,13 @@ void VPBundleRecipe::unbundle() {
BundledRecipes.clear();
}
-InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+InstructionCost VPSingleDefBundleRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Type *RedTy = Ctx.Types.inferScalarType(this);
auto *SrcVecTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
assert(RedTy->isIntegerTy() &&
- "VPBundleRecipe only supports integer types currently.");
+ "VPSingleDefBundleRecipe only supports integer types currently.");
switch (BundleType) {
case BundleTypes::ExtendedReduction: {
unsigned Opcode = RecurrenceDescriptor::getOpcode(
@@ -2596,21 +2596,21 @@ InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
}
}
-bool VPBundleRecipe::mayReadOrWriteMemory() const {
+bool VPSingleDefBundleRecipe::mayReadOrWriteMemory() const {
return any_of(BundledRecipes, [](VPSingleDefRecipe *R) {
return R->mayReadFromMemory() || R->mayWriteToMemory();
});
}
-bool VPBundleRecipe::mayHaveSideEffects() const {
+bool VPSingleDefBundleRecipe::mayHaveSideEffects() const {
return any_of(BundledRecipes,
[](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); });
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+void VPSingleDefBundleRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << Indent << "BUNDLE ";
printAsOperand(O, SlotTracker);
O << " = ";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dbeaf3dcf8bba..7113a3e133156 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2710,7 +2710,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
ToRemove.push_back(VPI);
}
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *Bundle = dyn_cast<VPBundleRecipe>(&R)) {
+ if (auto *Bundle = dyn_cast<VPSingleDefBundleRecipe>(&R)) {
Bundle->unbundle();
Bundle->eraseFromParent();
}
@@ -2813,10 +2813,10 @@ void VPlanTransforms::handleUncountableEarlyExit(
}
/// This function tries convert extended in-loop reductions to
-/// VPBundleRecipe and clamp the \p Range if it is beneficial and
+/// VPSingleDefBundleRecipe and clamp the \p Range if it is beneficial and
/// valid. The created recipe must be unbundled to its constituent
/// recipes before execution.
-static VPBundleRecipe *
+static VPSingleDefBundleRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
VFRange &Range) {
using namespace VPlanPatternMatch;
@@ -2850,19 +2850,20 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
Instruction::CastOps::ZExt,
Ctx.Types.inferScalarType(A)))
- return new VPBundleRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
+ return new VPSingleDefBundleRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
return nullptr;
}
/// This function tries convert extended in-loop reductions to
-/// VPBundleRecipe and clamp the \p Range if it is beneficial
-/// and valid. The created VPBundleRecipe must be unbundled to its constituent
-/// recipes before execution. Patterns of the VPBundleRecipe:
+/// VPSingleDefBundleRecipe and clamp the \p Range if it is beneficial
+/// and valid. The created VPSingleDefBundleRecipe must be unbundled to its
+/// constituent recipes before execution. Patterns of the
+/// VPSingleDefBundleRecipe:
/// reduce.add(mul(...)),
/// reduce.add(mul(ext(A), ext(B))),
/// reduce.add(ext(mul(ext(A), ext(B)))).
-static VPBundleRecipe *
+static VPSingleDefBundleRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
using namespace VPlanPatternMatch;
@@ -2919,11 +2920,11 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
Instruction::CastOps::ZExt,
Mul, RecipeA, RecipeB, nullptr)) {
- return new VPBundleRecipe(RecipeA, RecipeB, Mul, Red);
+ return new VPSingleDefBundleRecipe(RecipeA, RecipeB, Mul, Red);
}
// Match reduce.add(mul).
if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) {
- return new VPBundleRecipe(Mul, Red);
+ return new VPSingleDefBundleRecipe(Mul, Red);
}
}
// Match reduce.add(ext(mul(ext(A), ext(B)))).
@@ -2957,7 +2958,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Mul->setOperand(0, NewExt0);
Mul->setOperand(1, NewExt1);
Red->setOperand(1, Mul);
- return new VPBundleRecipe(NewExt0, NewExt1, Mul, Red);
+ return new VPSingleDefBundleRecipe(NewExt0, NewExt1, Mul, Red);
}
}
return nullptr;
@@ -2968,7 +2969,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
VPCostContext &Ctx,
VFRange &Range) {
- VPBundleRecipe *AbstractR = nullptr;
+ VPSingleDefBundleRecipe *AbstractR = nullptr;
auto IP = std::next(Red->getIterator());
auto *VPBB = Red->getParent();
if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
More information about the llvm-commits
mailing list