[llvm] [VPlan] Implement VPExtendedReduction, VPMulAccumulateReductionRecipe and corresponding vplan transformations. (PR #137746)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu May 15 00:24:52 PDT 2025
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/137746
>From 3769e1f169796f9ec591a29145ea9eb7608fe576 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 27 Apr 2025 23:32:25 -0700
Subject: [PATCH 1/9] [VPlan] Implement transformation for widen-cast/widen-mul
+ reduction to abstract recipe.
This patch introduces two new recipes.
* VPExtendedReductionRecipe
- cast + reduction.
* VPMulAccumulateReductionRecipe
- (cast) + mul + reduction.
This patch also implements the transformations that match the following
patterns in VPlan and convert them to abstract recipes for better cost
estimation.
* VPExtendedReductionRecipe
- reduce(cast(...))
* VPMulAccumulateReductionRecipe
- reduce.add(mul(...))
  - reduce.add(mul(ext(...), ext(...)))
- reduce.add(ext(mul(ext(...), ext(...))))
The converted abstract recipes will be lowered to the concrete recipes
(widen-cast + widen-mul + reduction) just before recipe execution.
Split from #113903.
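
As a rough illustration (a hypothetical example, not taken from this
patch's tests), a dot-product style loop like the one below is the kind
of source that, when vectorized with an in-loop reduction, produces the
reduce.add(mul(ext(...), ext(...))) pattern matched above. After the
transform the whole chain is costed as a single
VPMulAccumulateReductionRecipe and is lowered back to
widen-cast + widen-mul + reduction recipes before execution:

  #include <cstdint>

  // Hypothetical input: the multiply operands are sign-extended from i16
  // to i64 and the products are accumulated into a scalar, i.e.
  // reduce.add(mul(sext(x[i]), sext(y[i]))).
  int64_t dot(const int16_t *x, const int16_t *y, int n) {
    int64_t sum = 0;
    for (int i = 0; i < n; ++i)
      sum += (int64_t)x[i] * (int64_t)y[i];
    return sum;
  }

In the VPlan dump the abstract form shows up as a single MULACC-REDUCE
(or, for the cast-only pattern, EXTENDED-REDUCE) line instead of the
separate widen-cast/widen-mul/reduction recipes.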
---
.../Transforms/Vectorize/LoopVectorize.cpp | 19 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 259 +++++++++++++++++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 101 ++++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 247 +++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 7 +
llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +
.../LoopVectorize/ARM/mve-reduction-types.ll | 4 +-
.../LoopVectorize/ARM/mve-reductions.ll | 120 ++++----
.../LoopVectorize/reduction-inloop-pred.ll | 2 +-
.../LoopVectorize/reduction-inloop.ll | 8 +-
.../vplan-printing-reductions.ll | 145 ++++++++++
12 files changed, 838 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1611c6d3a4437..9af481d9a172f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9568,10 +9568,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
-
// Update wide induction increments to use the same step as the corresponding
// wide induction. This enables detecting induction increments directly in
// VPlan and removes redundant splats.
@@ -9601,6 +9597,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+  // Transform recipes to abstract recipes if doing so is legal and beneficial,
+  // and clamp the range for better cost estimation.
+  // TODO: Enable the following transform when the EVL versions of
+  // extended-reduction and mulacc-reduction are implemented.
+ if (!CM.foldTailWithEVL()) {
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
+ VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
+ CostCtx, Range);
+ }
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+ Plan->setName("Initial VPlan");
+
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2c4cac7655ec9..598413d7ddb74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -517,6 +517,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPMulAccumulateReductionSC:
+ case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
@@ -601,13 +603,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
+ struct NonNegFlagsTy {
+ char NonNeg : 1;
+ NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {}
+ };
+
private:
struct ExactFlagsTy {
char IsExact : 1;
};
- struct NonNegFlagsTy {
- char NonNeg : 1;
- };
struct FastMathFlagsTy {
char AllowReassoc : 1;
char NoNaNs : 1;
@@ -697,6 +701,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
: VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
DisjointFlags(DisjointFlags) {}
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ NonNegFlagsTy NonNegFlags, DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp),
+ NonNegFlags(NonNegFlags) {}
+
protected:
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
GEPNoWrapFlags GEPFlags, DebugLoc DL = {})
@@ -715,7 +725,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -812,6 +824,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
FastMathFlags getFastMathFlags() const;
+  /// Returns true if the recipe has the non-negative flag.
+ bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; }
+
+ bool isNonNeg() const {
+ assert(OpType == OperationType::NonNegOp &&
+ "recipe doesn't have a NNEG flag");
+ return NonNegFlags.NonNeg;
+ }
+
bool hasNoUnsignedWrap() const {
assert(OpType == OperationType::OverflowingBinOp &&
"recipe doesn't have a NUW flag");
@@ -1289,10 +1310,21 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
: VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
Opcode(I.getOpcode()) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
+ iterator_range<IterT> Operands, bool NUW, bool NSW, DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
+ Opcode(Opcode) {}
+
public:
VPWidenRecipe(Instruction &I, ArrayRef<VPValue *> Operands)
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned Opcode, iterator_range<IterT> Operands, bool NUW,
+ bool NSW, DebugLoc DL)
+ : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
+
~VPWidenRecipe() override = default;
VPWidenRecipe *clone() override {
@@ -1337,8 +1369,15 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
"opcode of underlying cast doesn't match");
}
- VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
- : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(),
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(),
+ Opcode(Opcode), ResultTy(ResultTy) {}
+
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ bool IsNonNeg, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg),
+ DL),
Opcode(Opcode), ResultTy(ResultTy) {}
~VPWidenCastRecipe() override = default;
@@ -2381,6 +2420,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
setUnderlyingValue(I);
}
+ /// For VPExtendedReductionRecipe.
+ /// Note that the debug location is from the extend.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+ /// For VPMulAccumulateReductionRecipe.
+ /// Note that the NUW/NSW flags and the debug location are from the Mul.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
@@ -2389,6 +2450,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
+ VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL = {})
+ : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
+ ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
+ IsOrdered, DL) {}
+
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
@@ -2399,7 +2467,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2538,6 +2608,181 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
}
};
+/// A recipe to represent inloop extended reduction operations, performing a
+/// reduction on an extended vector operand into a scalar value, and adding the
+/// result to a chain. This recipe is abstract and needs to be lowered to
+/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
+/// [Condition]}.
+class VPExtendedReductionRecipe : public VPReductionRecipe {
+  /// Opcode of the extend recipe that this abstract recipe will be lowered to.
+ Instruction::CastOps ExtOp;
+
+ Type *ResultTy;
+
+ /// For cloning VPExtendedReductionRecipe.
+ VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
+ : VPReductionRecipe(
+ VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
+ {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
+ ExtRed->isOrdered(), ExtRed->getDebugLoc()),
+ ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
+ transferFlags(*ExtRed);
+ }
+
+public:
+ VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
+ : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
+ R->isOrdered(), Ext->getDebugLoc()),
+ ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
+    // Not all VPWidenCastRecipes carry the nneg flag. Transfer the flags from
+    // the original recipe to avoid setting the wrong flags.
+ transferFlags(*Ext);
+ }
+
+ ~VPExtendedReductionRecipe() override = default;
+
+ VPExtendedReductionRecipe *clone() override {
+ auto *Copy = new VPExtendedReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
+
+ void execute(VPTransformState &State) override {
+    llvm_unreachable("VPExtendedReductionRecipe should be transformed to "
+                     "VPWidenCastRecipe + VPReductionRecipe before execution.");
+ };
+
+ /// Return the cost of VPExtendedReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The scalar type after extending.
+ Type *getResultType() const { return ResultTy; }
+
+ /// Is the extend ZExt?
+ bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
+
+  /// The opcode of the extend recipe.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+};
+
+/// A recipe to represent inloop MulAccumulateReduction operations, performing a
+/// reduction.add on the result of multiplying two (possibly extended) vector
+/// operands into a scalar value, and adding the result to a chain. This
+/// recipe is abstract and needs to be lowered to concrete recipes before
+/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}.
+class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe.
+ Instruction::CastOps ExtOp;
+
+ /// Non-neg flag of the extend recipe.
+ bool IsNonNeg = false;
+
+ Type *ResultTy;
+
+ /// For cloning VPMulAccumulateReductionRecipe.
+ VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
+ {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
+ MulAcc->getCondOp(), MulAcc->isOrdered(),
+ WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
+ MulAcc->getDebugLoc()),
+ ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
+ ResultTy(MulAcc->getResultType()) {}
+
+public:
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
+ VPWidenCastRecipe *Ext0,
+ VPWidenCastRecipe *Ext1, Type *ResultTy)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+           "The reduction instruction in MulAccumulateReductionRecipe must "
+ "be Add");
+    // Only set the non-negative flag if the original recipe has one.
+ if (Ext0->hasNonNegFlag())
+ IsNonNeg = Ext0->isNonNeg();
+ }
+
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Instruction::CastOps::CastOpsEnd) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateReductionRecipe must be "
+ "Add");
+ }
+
+ ~VPMulAccumulateReductionRecipe() override = default;
+
+ VPMulAccumulateReductionRecipe *clone() override {
+ auto *Copy = new VPMulAccumulateReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
+
+ void execute(VPTransformState &State) override {
+    llvm_unreachable("VPMulAccumulateReductionRecipe should be transformed to "
+ "VPWidenCastRecipe + "
+ "VPWidenRecipe + VPReductionRecipe before execution");
+ }
+
+ /// Return the cost of VPMulAccumulateReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ Type *getResultType() const {
+ assert(isExtended() && "Only support getResultType when this recipe "
+                           "contains an implicit extend.");
+ return ResultTy;
+ }
+
+  /// The VPValues of the vector operands to be multiplied and reduced.
+ VPValue *getVecOp0() const { return getOperand(1); }
+ VPValue *getVecOp1() const { return getOperand(2); }
+
+  /// Return true if this MulAcc recipe contains extended operands.
+ bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
+
+ /// Return the opcode of the extends for the operands.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+
+  /// Return true if the operands are zero-extended.
+ bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
+
+  /// Return the non-negative flag of the extend recipe.
+ bool isNonNeg() const { return IsNonNeg; }
+};
+
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c86815c84d8d9..7dcbd72c25191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -273,6 +273,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
+ .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
+ [](const auto *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3c7ab7d24bf6d..04eab649713bc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -73,6 +73,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -120,6 +122,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -157,6 +161,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
@@ -2521,28 +2527,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
+ std::optional<FastMathFlags> OptionalFMF =
+ ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
- // TODO: Support any-of and in-loop reductions.
+ // TODO: Support any-of reductions.
assert(
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
- assert(
- (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
- ForceTargetInstructionCost.getNumOccurrences() > 0) &&
- "In-loop reduction not implemented in VPlan-based cost model currently.");
- // Cost = Reduction cost + BinOp cost
- InstructionCost Cost =
- Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- return Cost +
- Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+ return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
- return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
- Ctx.CostKind);
+ return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
+ Ctx.CostKind);
+}
+
+InstructionCost
+VPExtendedReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
+ Type *RedTy = Ctx.Types.inferScalarType(this);
+ auto *SrcVecTy =
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
+ assert(RedTy->isIntegerTy() &&
+         "ExtendedReduction only supports integer types currently.");
+ InstructionCost Cost = Ctx.TTI.getExtendedReductionCost(
+ Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+ // The cost of this recipe should be decided by the legacy model.
+ return Cost.isValid() ? 0 : Cost;
+}
+
+InstructionCost
+VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ Type *RedTy = Ctx.Types.inferScalarType(this);
+ auto *SrcVecTy =
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
+ InstructionCost Cost =
+ Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind);
+ // The cost of this recipe should be decided by the legacy model.
+ return Cost.isValid() ? 0 : Cost;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2587,6 +2614,56 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
O << ")";
}
+
+void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EXTENDED-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " +";
+ O << " reduce."
+ << Instruction::getOpcodeName(
+ RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
+ << " (";
+ getVecOp()->printAsOperand(O, SlotTracker);
+ O << " extended to " << *getResultType();
+ if (isConditional()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+}
+
+void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "MULACC-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " + ";
+ O << "reduce."
+ << Instruction::getOpcodeName(
+ RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
+ << " (";
+ O << "mul";
+ printFlags(O);
+ if (isExtended())
+ O << "(";
+ getVecOp0()->printAsOperand(O, SlotTracker);
+ if (isExtended())
+ O << " extended to " << *getResultType() << "), (";
+ else
+ O << ", ";
+ getVecOp1()->printAsOperand(O, SlotTracker);
+ if (isExtended())
+ O << " extended to " << *getResultType() << ")";
+ if (isConditional()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+}
#endif
/// A helper function to scalarize a single Instruction in the innermost loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 806c20ef8cf73..5e56104c875af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2392,6 +2392,82 @@ void VPlanTransforms::createInterleaveGroups(
}
}
+// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe.
+static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
+ VPWidenCastRecipe *Ext;
+ // Only ZExt contains non-neg flags.
+ if (ExtRed->isZExt())
+ Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
+ ExtRed->getResultType(), ExtRed->isNonNeg(),
+ ExtRed->getDebugLoc());
+ else
+ Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
+ ExtRed->getResultType(), ExtRed->getDebugLoc());
+
+ auto *Red = new VPReductionRecipe(
+ ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext,
+ ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc());
+ Ext->insertBefore(ExtRed);
+ Red->insertBefore(ExtRed);
+ ExtRed->replaceAllUsesWith(Red);
+ ExtRed->eraseFromParent();
+}
+
+// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) +
+// VPReductionRecipe (reduce.add) +
+// VPWidenCastRecipe (optional).
+static void
+expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
+ // Generate inner VPWidenCastRecipes if necessary.
+  // Note that we will drop the extend after the mul, which transforms
+ // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+ VPValue *Op0, *Op1;
+ if (MulAcc->isExtended()) {
+ Type *RedTy = MulAcc->getResultType();
+ if (MulAcc->isZExt())
+ Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
+ RedTy, MulAcc->isNonNeg(),
+ MulAcc->getDebugLoc());
+ else
+ Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
+ RedTy, MulAcc->getDebugLoc());
+ Op0->getDefiningRecipe()->insertBefore(MulAcc);
+    // Avoid generating duplicate VPWidenCastRecipes for
+    // reduce.add(mul(ext(A), ext(A))).
+ if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) {
+ Op1 = Op0;
+ } else {
+ if (MulAcc->isZExt())
+ Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
+ RedTy, MulAcc->isNonNeg(),
+ MulAcc->getDebugLoc());
+ else
+ Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
+ RedTy, MulAcc->getDebugLoc());
+ Op1->getDefiningRecipe()->insertBefore(MulAcc);
+ }
+ } else {
+ // No extends in this MulAccRecipe.
+ Op0 = MulAcc->getVecOp0();
+ Op1 = MulAcc->getVecOp1();
+ }
+
+ std::array<VPValue *, 2> MulOps = {Op0, Op1};
+ auto *Mul = new VPWidenRecipe(
+ Instruction::Mul, make_range(MulOps.begin(), MulOps.end()),
+ MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap(),
+ MulAcc->getDebugLoc());
+ Mul->insertBefore(MulAcc);
+
+ auto *Red = new VPReductionRecipe(
+ MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul,
+ MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc());
+ Red->insertBefore(MulAcc);
+
+ MulAcc->replaceAllUsesWith(Red);
+ MulAcc->eraseFromParent();
+}
+
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
using namespace llvm::VPlanPatternMatch;
@@ -2454,6 +2530,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
VPI->replaceAllUsesWith(VectorStep);
ToRemove.push_back(VPI);
}
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R))
+ expandVPExtendedReduction(ExtRed);
+ if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(&R))
+ expandVPMulAccumulateReduction(MulAcc);
+ }
}
for (VPRecipeBase *R : ToRemove)
@@ -2551,6 +2633,171 @@ void VPlanTransforms::handleUncountableEarlyExit(
LatchExitingBranch->eraseFromParent();
}
+/// This function tries to convert extended in-loop reductions to
+/// VPExtendedReductionRecipe and clamps the \p Range if it is beneficial and
+/// valid. The created recipe must be lowered to concrete
+/// recipes before execution.
+static VPExtendedReductionRecipe *
+tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
+ VFRange &Range) {
+ using namespace VPlanPatternMatch;
+
+ Type *RedTy = Ctx.Types.inferScalarType(Red);
+ VPValue *VecOp = Red->getVecOp();
+
+ // Clamp the range if using extended-reduction is profitable.
+ auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
+ Type *SrcTy) -> bool {
+ return LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
+ Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
+ CostKind);
+ InstructionCost ExtCost =
+ cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
+ InstructionCost RedCost = Red->computeCost(VF, Ctx);
+ return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
+ },
+ Range);
+ };
+
+ VPValue *A;
+  // Match reduce(ext(A)).
+ if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
+ IsExtendedRedValidAndClampRange(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+ cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ Ctx.Types.inferScalarType(A)))
+ return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(VecOp));
+
+ return nullptr;
+}
+
+/// This function tries to convert in-loop reduce.add reductions to
+/// VPMulAccumulateReductionRecipe and clamps the \p Range if it is beneficial
+/// and valid. The created VPMulAccumulateReductionRecipe must be lowered to
+/// concrete recipes before execution. Patterns of MulAccumulateReduction:
+/// reduce.add(mul(...)),
+/// reduce.add(mul(ext(A), ext(B))),
+/// reduce.add(ext(mul(ext(A), ext(B)))).
+static VPMulAccumulateReductionRecipe *
+tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
+ VPCostContext &Ctx, VFRange &Range) {
+ using namespace VPlanPatternMatch;
+
+ Type *RedTy = Ctx.Types.inferScalarType(Red);
+
+ // Clamp the range if using multiply-accumulate-reduction is profitable.
+ auto IsMulAccValidAndClampRange =
+ [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
+ VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
+ return LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ Type *SrcTy =
+ Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
+ auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
+ InstructionCost MulAccCost =
+ Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+ InstructionCost MulCost = Mul->computeCost(VF, Ctx);
+ InstructionCost RedCost = Red->computeCost(VF, Ctx);
+ InstructionCost ExtCost = 0;
+ if (Ext0)
+ ExtCost += Ext0->computeCost(VF, Ctx);
+ if (Ext1)
+ ExtCost += Ext1->computeCost(VF, Ctx);
+ if (OuterExt)
+ ExtCost += OuterExt->computeCost(VF, Ctx);
+
+ return MulAccCost.isValid() &&
+ MulAccCost < ExtCost + MulCost + RedCost;
+ },
+ Range);
+ };
+
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+ if (Opcode != Instruction::Add)
+ return nullptr;
+
+ VPValue *VecOp = Red->getVecOp();
+ VPValue *A, *B;
+ // Try to match reduce.add(mul(...))
+ if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
+ auto *RecipeA =
+ dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+ auto *RecipeB =
+ dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+ auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+
+ // Match reduce.add(mul(ext, ext))
+ if (RecipeA && RecipeB &&
+ (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
+ match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
+ match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
+ IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ Mul, RecipeA, RecipeB, nullptr))
+ return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB,
+ RecipeA->getResultType());
+ // Match reduce.add(mul)
+ if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
+ return new VPMulAccumulateReductionRecipe(Red, Mul);
+ }
+ // Match reduce.add(ext(mul(ext(A), ext(B))))
+  // All extend recipes must have the same opcode, or A == B, in which case
+  // the pattern can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
+ m_ZExtOrSExt(m_VPValue()))))) {
+ auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
+ auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
+ auto *Ext0 =
+ cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
+ auto *Ext1 =
+ cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
+ if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+ Ext0->getOpcode() == Ext1->getOpcode() &&
+ IsMulAccValidAndClampRange(Ext0->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ Mul, Ext0, Ext1, Ext))
+ return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1,
+ Ext->getResultType());
+ }
+ return nullptr;
+}
+
+/// This function tries to create abstract recipes from the reduction recipe for
+/// subsequent optimizations and cost estimation.
+static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
+ VPCostContext &Ctx,
+ VFRange &Range) {
+ VPReductionRecipe *AbstractR = nullptr;
+
+ if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
+ AbstractR = MulAcc;
+ else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
+ AbstractR = ExtRed;
+ // Cannot create abstract inloop reduction recipes.
+ if (!AbstractR)
+ return;
+
+ AbstractR->insertBefore(Red);
+ Red->replaceAllUsesWith(AbstractR);
+}
+
+void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range) {
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
+ tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+ }
+ }
+}
+
void VPlanTransforms::materializeStepVectors(VPlan &Plan) {
for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
auto *IVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index d284d916633c8..3a1ed7406b383 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -190,6 +190,13 @@ struct VPlanTransforms {
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
+  /// This function converts initial recipes to abstract recipes and clamps
+  /// \p Range based on the cost model, for subsequent optimizations and cost
+  /// estimation. The converted abstract recipes will be lowered to concrete
+  /// recipes before codegen.
+ static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range);
+
/// Perform instcombine-like simplifications on recipes in \p Plan. Use \p
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 638156eab7a84..64065edd315f9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -339,6 +339,8 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPMulAccumulateReductionSC,
+ VPExtendedReductionSC,
VPPartialReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index 2078a10d04ce7..ce3b2a9f216f2 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -23,11 +23,11 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
@@ -105,11 +105,11 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index a11cc15a8a85b..d021306b89aab 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -646,12 +646,11 @@ define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -726,12 +725,11 @@ define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -855,10 +853,10 @@ define i32 @mla_i16_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
@@ -910,10 +908,10 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
@@ -1016,10 +1014,10 @@ define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP4]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP5]])
@@ -1122,10 +1120,10 @@ define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalia
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
@@ -1459,9 +1457,8 @@ define i64 @mla_xx_sext_zext(ptr nocapture noundef readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -1528,11 +1525,11 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP2]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -1667,24 +1664,55 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP11]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], splat (i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3:%.*]], [[ADD:%.*]]
+; CHECK-NEXT: [[ADD:%.*]] = phi i64 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD3:%.*]] = phi i64 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3]], [[ADD]]
; CHECK-NEXT: ret i64 [[DIV]]
; CHECK: for.body:
-; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY1]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD3]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X:%.*]], i32 [[I_013]]
+; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD1]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_013]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP0]], 8
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SHR]] to i64
-; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_014]], [[CONV]]
+; CHECK-NEXT: [[ADD1]] = add nsw i64 [[S_014]], [[CONV]]
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV]]
-; CHECK-NEXT: [[ADD3]] = add nuw nsw i64 [[MUL]], [[T_012]]
+; CHECK-NEXT: [[ADD5]] = add nuw nsw i64 [[MUL]], [[T_012]]
; CHECK-NEXT: [[ADD4]] = add nuw nsw i32 [[I_013]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[ADD4]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY1]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -1720,10 +1748,10 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 7
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -8
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
@@ -1731,28 +1759,26 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i32> [[TMP7]] to <4 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[TMP11]] to <4 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]])
+; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <8 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP13]])
; CHECK-NEXT: [[TMP16]] = add i64 [[TMP15]], [[TMP10]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
@@ -1787,7 +1813,7 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]]
; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP38:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP40:![0-9]+]]
;
entry:
%cmp23 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index 4c7a74ed05b58..17e3bb3cce7eb 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -476,10 +476,10 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK: pred.load.continue8:
; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]]
; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]])
; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]]
; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]])
; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 0ad1581f0a4a1..9ca7a84b3ea1c 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -225,9 +225,9 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1289,15 +1289,13 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], splat (i8 31)
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], splat (i8 3)
; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], splat (i8 31)
; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 2cf630de208c9..cf920c91913fb 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -268,3 +268,148 @@ loop:
exit:
ret i64 %cond
}
+
+define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_extended_reduction'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
+; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv
+ %load0 = load i32, ptr %arrayidx, align 4
+ %conv0 = zext i32 %load0 to i64
+ %rdx.next = add nsw i64 %rdx, %conv0
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+ ret i64 %r.0.lcssa
+}
+
+define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_mulacc'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
+; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, ptr %x, i32 %iv
+ %load0 = load i64, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i64, ptr %y, i32 %iv
+ %load1 = load i64, ptr %arrayidx1, align 4
+ %mul = mul nsw i64 %load0, %load1
+ %rdx.next = add nsw i64 %rdx, %mul
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+ ret i64 %r.0.lcssa
+}
+
+define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_mulacc_extended'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
+; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv
+ %load0 = load i16, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv
+ %load1 = load i16, ptr %arrayidx1, align 4
+ %conv0 = sext i16 %load0 to i32
+ %conv1 = sext i16 %load1 to i32
+ %mul = mul nsw i32 %conv0, %conv1
+ %conv = sext i32 %mul to i64
+ %rdx.next = add nsw i64 %rdx, %conv
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+ ret i64 %r.0.lcssa
+}
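+
For orientation, a plain C++ model of what the last of these tests, @print_mulacc_extended, computes in scalar form; the names x, y and n mirror the IR arguments, and this is only a sketch added for illustration, not part of the patch.

#include <cstdint>

// Scalar model of reduce.add(sext(mul(sext(a), sext(b)))): i16 inputs are
// sign-extended to i32, multiplied, and the product is sign-extended to i64
// before being accumulated into the i64 reduction chain.
int64_t mulacc_extended(const int16_t *x, const int16_t *y, int n) {
  int64_t rdx = 0;
  for (int i = 0; i < n; ++i) {
    int32_t mul = int32_t(x[i]) * int32_t(y[i]); // mul nsw i32 of the sexts
    rdx += int64_t(mul);                         // sext to i64, add to chain
  }
  return rdx;
}

The first two tests drop one or both extends: @print_extended_reduction accumulates zext'd i32 loads into an i64 chain, and @print_mulacc multiplies the i64 loads directly.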
>From a4077bce3c5dc2e692478c6483381c3d0c665f66 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 4 May 2025 16:16:25 -0700
Subject: [PATCH 2/9] Fixup, Address comments.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 598413d7ddb74..b3566f8ea4976 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2635,6 +2635,10 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
{R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
R->isOrdered(), Ext->getDebugLoc()),
ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
+ assert((ExtOp == Instruction::CastOps::ZExt ||
+ ExtOp == Instruction::CastOps::SExt) &&
+ "VPExtendedReductionRecipe only support zext and sext.");
+
// Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
// the original recipe to prevent setting wrong flags.
transferFlags(*Ext);
@@ -2643,9 +2647,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
~VPExtendedReductionRecipe() override = default;
VPExtendedReductionRecipe *clone() override {
- auto *Copy = new VPExtendedReductionRecipe(this);
- Copy->transferFlags(*this);
- return Copy;
+ return new VPExtendedReductionRecipe(this);
}
VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
@@ -2715,6 +2717,9 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
Instruction::Add &&
"The reduction instruction in MulAccumulateteReductionRecipe must "
"be Add");
+ assert((ExtOp == Instruction::CastOps::ZExt ||
+ ExtOp == Instruction::CastOps::SExt) &&
+ "VPMulAccumulateReductionRecipe only support zext and sext.");
// Only set the non-negative flag if the original recipe contains it.
if (Ext0->hasNonNegFlag())
IsNonNeg = Ext0->isNonNeg();
>From ce95f18efeb972629cfe0b415475f3726e91fd52 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 5 May 2025 07:22:17 -0700
Subject: [PATCH 3/9] !fixup, Remove `computeCost()` for new recipes.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 39 ++++---------------
.../vplan-printing-reductions.ll | 12 +++---
3 files changed, 16 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b3566f8ea4976..5de910046ef89 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2642,6 +2642,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
// Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
// the original recipe to prevent setting wrong flags.
transferFlags(*Ext);
+ setUnderlyingValue(R->getUnderlyingValue());
}
~VPExtendedReductionRecipe() override = default;
@@ -2657,10 +2658,6 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
"VPExtendedRecipe + VPReductionRecipe before execution.");
};
- /// Return the cost of VPExtendedReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2720,6 +2717,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
assert((ExtOp == Instruction::CastOps::ZExt ||
ExtOp == Instruction::CastOps::SExt) &&
"VPMulAccumulateReductionRecipe only support zext and sext.");
+ setUnderlyingValue(R->getUnderlyingValue());
// Only set the non-negative flag if the original recipe contains.
if (Ext0->hasNonNegFlag())
IsNonNeg = Ext0->isNonNeg();
@@ -2737,6 +2735,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
Instruction::Add &&
"The reduction instruction in MulAccumulateReductionRecipe must be "
"Add");
+ setUnderlyingValue(R->getUnderlyingValue());
}
~VPMulAccumulateReductionRecipe() override = default;
@@ -2755,10 +2754,6 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
"VPWidenRecipe + VPReductionRecipe before execution");
}
- /// Return the cost of VPMulAccumulateReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 04eab649713bc..f4093af7377f8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2527,8 +2527,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
- std::optional<FastMathFlags> OptionalFMF =
- ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
// TODO: Support any-of reductions.
assert(
@@ -2536,40 +2534,17 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
+ // Cost = Reduction cost + BinOp cost
+ InstructionCost Cost =
+ Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+ return Cost +
+ Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
- return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
- Ctx.CostKind);
-}
-
-InstructionCost
-VPExtendedReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
- Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
- assert(RedTy->isIntegerTy() &&
- "ExtendedReduction only support integer type currently.");
- InstructionCost Cost = Ctx.TTI.getExtendedReductionCost(
- Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
- // The cost of this recipe should be decided by the legacy model.
- return Cost.isValid() ? 0 : Cost;
-}
-
-InstructionCost
-VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
- InstructionCost Cost =
- Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind);
- // The cost of this recipe should be decided by the legacy model.
- return Cost.isValid() ? 0 : Cost;
+ return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
+ Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
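
The net effect of the computeCost change above is that an in-loop reduction is now costed as the scalar accumulate binop plus the vector reduction, rather than the reduction alone. A minimal restatement, assuming only the TTI hooks already used in the hunk (a sketch, not the upstream function):

#include "llvm/Analysis/TargetTransformInfo.h"
#include <optional>

using namespace llvm;

// Sketch: in-loop reduction cost = scalar binop cost + vector reduction cost.
static InstructionCost
inLoopReductionCost(const TargetTransformInfo &TTI, unsigned Opcode,
                    Type *ElementTy, VectorType *VectorTy, FastMathFlags FMFs,
                    TargetTransformInfo::TargetCostKind CostKind,
                    std::optional<Intrinsic::ID> MinMaxID = std::nullopt) {
  // The scalar operation that accumulates into the reduction chain.
  InstructionCost Cost =
      TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
  // Min/max recurrences reduce through an intrinsic; everything else through
  // a plain arithmetic reduction.
  if (MinMaxID)
    return Cost +
           TTI.getMinMaxReductionCost(*MinMaxID, VectorTy, FMFs, CostKind);
  return Cost +
         TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs, CostKind);
}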
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index cf920c91913fb..307228220cb15 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -283,12 +283,12 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
-; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
+; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -327,7 +327,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -335,7 +335,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -376,7 +376,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -384,7 +384,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
+; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
>From d2674119cf9fd8377563933ce54cff82cb1ee545 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 7 May 2025 16:26:53 -0700
Subject: [PATCH 4/9] !fixup, address comments.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 34 ++++++++++---------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 23 ++++++-------
.../vplan-printing-reductions.ll | 4 +--
4 files changed, 37 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5de910046ef89..cc81f376c41ec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1311,8 +1311,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
Opcode(I.getOpcode()) {}
template <typename IterT>
- VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
- iterator_range<IterT> Operands, bool NUW, bool NSW, DebugLoc DL)
+ VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef<IterT> Operands,
+ bool NUW, bool NSW, DebugLoc DL)
: VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
Opcode(Opcode) {}
@@ -1321,8 +1321,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
template <typename IterT>
- VPWidenRecipe(unsigned Opcode, iterator_range<IterT> Operands, bool NUW,
- bool NSW, DebugLoc DL)
+ VPWidenRecipe(unsigned Opcode, ArrayRef<IterT> Operands, bool NUW, bool NSW,
+ DebugLoc DL)
: VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
~VPWidenRecipe() override = default;
@@ -2614,9 +2614,10 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
/// [Condition]}.
class VPExtendedReductionRecipe : public VPReductionRecipe {
- /// Opcode of the extend recipe will be lowered to.
+ /// Opcode of the extend for VecOp.
Instruction::CastOps ExtOp;
+ /// The scalar type after extending.
Type *ResultTy;
/// For cloning VPExtendedReductionRecipe.
@@ -2637,10 +2638,8 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
assert((ExtOp == Instruction::CastOps::ZExt ||
ExtOp == Instruction::CastOps::SExt) &&
- "VPExtendedReductionRecipe only support zext and sext.");
+ "VPExtendedReductionRecipe only supports zext and sext.");
- // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
- // the original recipe to prevent setting wrong flags.
transferFlags(*Ext);
setUnderlyingValue(R->getUnderlyingValue());
}
@@ -2670,7 +2669,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
/// Is the extend ZExt?
bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
- /// The opcode of extend recipe.
+ /// Get the opcode of the extend for VecOp.
Instruction::CastOps getExtOpcode() const { return ExtOp; }
};
@@ -2680,12 +2679,13 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
/// recipe is abstract and needs to be lowered to concrete recipes before
/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}.
class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
- /// Opcode of the extend recipe.
+ /// Opcode of the extend for VecOp1 and VecOp2.
Instruction::CastOps ExtOp;
/// Non-neg flag of the extend recipe.
bool IsNonNeg = false;
+ /// The scalar type after extending.
Type *ResultTy;
/// For cloning VPMulAccumulateReductionRecipe.
@@ -2716,7 +2716,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
"be Add");
assert((ExtOp == Instruction::CastOps::ZExt ||
ExtOp == Instruction::CastOps::SExt) &&
- "VPMulAccumulateReductionRecipe only support zext and sext.");
+ "VPMulAccumulateReductionRecipe only supports zext and sext.");
setUnderlyingValue(R->getUnderlyingValue());
// Only set the non-negative flag if the original recipe contains it.
if (Ext0->hasNonNegFlag())
@@ -2762,24 +2762,26 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
Type *getResultType() const {
assert(isExtended() && "Only support getResultType when this recipe "
- "contains implicit extend.");
+ "is implicitly extend.");
return ResultTy;
}
- /// The VPValue of the vector value to be extended and reduced.
+ /// The first vector value to be extended and reduced.
VPValue *getVecOp0() const { return getOperand(1); }
+
+ /// The second vector value to be extended and reduced.
VPValue *getVecOp1() const { return getOperand(2); }
- /// Return if this MulAcc recipe contains extended operands.
+ /// Return true if this recipe contains extended operands.
bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
/// Return the opcode of the extends for the operands.
Instruction::CastOps getExtOpcode() const { return ExtOp; }
- /// Return if the operands are zero extended.
+ /// Return if the operands are zero-extended.
bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
- /// Return the non negative flag of the ext recipe.
+ /// Return true if the operand extends have the non-negative flag.
bool isNonNeg() const { return IsNonNeg; }
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f4093af7377f8..fc1ee89e81c75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2602,7 +2602,8 @@ void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent,
RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
<< " (";
getVecOp()->printAsOperand(O, SlotTracker);
- O << " extended to " << *getResultType();
+ printFlags(O);
+ O << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType();
if (isConditional()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
@@ -2627,12 +2628,14 @@ void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << "(";
getVecOp0()->printAsOperand(O, SlotTracker);
if (isExtended())
- O << " extended to " << *getResultType() << "), (";
+ O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType()
+ << "), (";
else
O << ", ";
getVecOp1()->printAsOperand(O, SlotTracker);
if (isExtended())
- O << " extended to " << *getResultType() << ")";
+ O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType()
+ << ")";
if (isConditional()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5e56104c875af..87fa4f268ea15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2419,7 +2419,7 @@ static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
static void
expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
// Generate inner VPWidenCastRecipes if necessary.
- // Note that we will drop the extend after mul which transform
+ // Note that we will drop the extend after mul which transforms
// reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
VPValue *Op0, *Op1;
if (MulAcc->isExtended()) {
@@ -2454,9 +2454,8 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
std::array<VPValue *, 2> MulOps = {Op0, Op1};
auto *Mul = new VPWidenRecipe(
- Instruction::Mul, make_range(MulOps.begin(), MulOps.end()),
- MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap(),
- MulAcc->getDebugLoc());
+ Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(),
+ MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc());
Mul->insertBefore(MulAcc);
auto *Red = new VPReductionRecipe(
@@ -2688,6 +2687,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
using namespace VPlanPatternMatch;
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+ if (Opcode != Instruction::Add)
+ return nullptr;
+
Type *RedTy = Ctx.Types.inferScalarType(Red);
// Clamp the range if using multiply-accumulate-reduction is profitable.
@@ -2718,13 +2721,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Range);
};
- unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
- if (Opcode != Instruction::Add)
- return nullptr;
-
VPValue *VecOp = Red->getVecOp();
VPValue *A, *B;
- // Try to match reduce.add(mul(...))
+ // Try to match reduce.add(mul(...)).
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
auto *RecipeA =
dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
@@ -2732,7 +2731,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
- // Match reduce.add(mul(ext, ext))
+ // Match reduce.add(mul(ext, ext)).
if (RecipeA && RecipeB &&
(RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
@@ -2742,11 +2741,11 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Mul, RecipeA, RecipeB, nullptr))
return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB,
RecipeA->getResultType());
- // Match reduce.add(mul)
+ // Match reduce.add(mul).
if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
return new VPMulAccumulateReductionRecipe(Red, Mul);
}
- // Match reduce.add(ext(mul(ext(A), ext(B))))
+ // Match reduce.add(ext(mul(ext(A), ext(B)))).
// All extend recipes must have same opcode or A == B
// which can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
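
To summarize the matching logic in this hunk, the reduce.add operand must take one of the two shapes below. The sketch uses only the VPlanPatternMatch helpers that appear above, omits the IsMulAccValidAndClampRange profitability check, and is illustrative rather than the actual matcher; the helper name is hypothetical and written as if it sat next to the matcher in VPlanTransforms.cpp.

// Hypothetical helper: does the reduce.add operand look like a mul-acc shape?
static bool looksLikeMulAccPattern(VPValue *VecOp) {
  using namespace VPlanPatternMatch;
  VPValue *A, *B;
  // Shape 1: reduce.add(mul(A, B)); A and B may be defined by zext/sext
  // recipes, which the real matcher inspects before clamping the VF range.
  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
    (void)A; (void)B; // bound only to show the capture points
    return true;
  }
  // Shape 2: reduce.add(ext(mul(ext(A), ext(B)))); the outer extend is folded
  // away when the abstract recipe is formed (see
  // expandVPMulAccumulateReduction above).
  return match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
                                         m_ZExtOrSExt(m_VPValue()))));
}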
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 307228220cb15..da42d62d39c2e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -288,7 +288,7 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
-; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
+; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -384,7 +384,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
+; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
>From 06ef08793483c35ee0e9ac94cadad149654deca4 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 7 May 2025 17:21:27 -0700
Subject: [PATCH 5/9] !fixup, fix assertion of getResultType().
---
llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index cc81f376c41ec..d430fb1969a9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2696,8 +2696,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
MulAcc->getCondOp(), MulAcc->isOrdered(),
WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
MulAcc->getDebugLoc()),
- ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
- ResultTy(MulAcc->getResultType()) {}
+ ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()) {
+ if (MulAcc->isExtended())
+ ResultTy = MulAcc->getResultType();
+ }
public:
VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
>From 34a6f3b5c4a876bf4ddb1b7d86686584e60a7628 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Thu, 8 May 2025 17:15:04 -0700
Subject: [PATCH 6/9] !fixup getResultType() in VPMulAccumulateReductionRecipe.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++------
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 6 +++++-
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +++-
3 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d430fb1969a9a..973ef4de44efb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2686,7 +2686,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
bool IsNonNeg = false;
/// The scalar type after extending.
- Type *ResultTy;
+ Type *ResultTy = nullptr;
/// For cloning VPMulAccumulateReductionRecipe.
VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
@@ -2762,11 +2762,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
VPSlotTracker &SlotTracker) const override;
#endif
- Type *getResultType() const {
- assert(isExtended() && "Only support getResultType when this recipe "
- "is implicitly extend.");
- return ResultTy;
- }
+ Type *getResultType() const { return ResultTy; }
/// The first vector value to be extended and reduced.
VPValue *getVecOp0() const { return getOperand(1); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 7dcbd72c25191..cd8878fce26d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -273,8 +273,12 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
- .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
+ .Case<VPExtendedReductionRecipe>(
[](const auto *R) { return R->getResultType(); })
+ .Case<VPMulAccumulateReductionRecipe>([this](const auto *R) {
+ return R->isExtended() ? R->getResultType()
+ : inferScalarType(R->getOperand(0));
+ })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 87fa4f268ea15..98af08fc65426 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2530,8 +2530,10 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
ToRemove.push_back(VPI);
}
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R))
+ if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) {
expandVPExtendedReduction(ExtRed);
+ continue;
+ }
if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(&R))
expandVPMulAccumulateReduction(MulAcc);
}
>From bfc5fc2e2f85d6efefcec81d726f5df7198b8358 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 12 May 2025 09:04:08 -0700
Subject: [PATCH 7/9] Fixup! Remove IterT and always add result type in
VPMulAccumulateReductionRecipe.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 15 +++++++--------
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 6 +-----
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
3 files changed, 9 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 973ef4de44efb..ecc79653df296 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1310,9 +1310,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
: VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
Opcode(I.getOpcode()) {}
- template <typename IterT>
- VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef<IterT> Operands,
- bool NUW, bool NSW, DebugLoc DL)
+ VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
+ ArrayRef<VPValue *> Operands, bool NUW, bool NSW, DebugLoc DL)
: VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
Opcode(Opcode) {}
@@ -1320,9 +1319,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
VPWidenRecipe(Instruction &I, ArrayRef<VPValue *> Operands)
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
- template <typename IterT>
- VPWidenRecipe(unsigned Opcode, ArrayRef<IterT> Operands, bool NUW, bool NSW,
- DebugLoc DL)
+ VPWidenRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, bool NUW,
+ bool NSW, DebugLoc DL)
: VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
~VPWidenRecipe() override = default;
@@ -2725,14 +2723,15 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
IsNonNeg = Ext0->isNonNeg();
}
- VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul)
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
+ Type *ResultTy)
: VPReductionRecipe(
VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
{R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
R->getCondOp(), R->isOrdered(),
WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
R->getDebugLoc()),
- ExtOp(Instruction::CastOps::CastOpsEnd) {
+ ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) {
assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
Instruction::Add &&
"The reduction instruction in MulAccumulateReductionRecipe must be "
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index cd8878fce26d9..7dcbd72c25191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -273,12 +273,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
- .Case<VPExtendedReductionRecipe>(
+ .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
[](const auto *R) { return R->getResultType(); })
- .Case<VPMulAccumulateReductionRecipe>([this](const auto *R) {
- return R->isExtended() ? R->getResultType()
- : inferScalarType(R->getOperand(0));
- })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 98af08fc65426..81accab2be54e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2745,7 +2745,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
RecipeA->getResultType());
// Match reduce.add(mul).
if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
- return new VPMulAccumulateReductionRecipe(Red, Mul);
+ return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy);
}
// Match reduce.add(ext(mul(ext(A), ext(B)))).
// All extend recipes must have same opcode or A == B
>From a0515c33c4014299d22f29e6f5be4233402412c1 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 12 May 2025 09:19:36 -0700
Subject: [PATCH 8/9] !fixup update VPMulAccumulateReduction::clone().
---
llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ecc79653df296..4cbf0e82ea701 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2694,10 +2694,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
MulAcc->getCondOp(), MulAcc->isOrdered(),
WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
MulAcc->getDebugLoc()),
- ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()) {
- if (MulAcc->isExtended())
- ResultTy = MulAcc->getResultType();
- }
+ ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
+ ResultTy(MulAcc->getResultType()) {}
public:
VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
>From fca5a28590c49896098a7ab1763c4943be81c041 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 14 May 2025 23:21:58 -0700
Subject: [PATCH 9/9] Address comments.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4cbf0e82ea701..0c549af66a751 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2626,6 +2626,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
ExtRed->isOrdered(), ExtRed->getDebugLoc()),
ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
transferFlags(*ExtRed);
+ setUnderlyingValue(ExtRed->getUnderlyingValue());
}
public:
@@ -2671,11 +2672,11 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
Instruction::CastOps getExtOpcode() const { return ExtOp; }
};
-/// A recipe to represent inloop MulAccumulateReduction operations, performing a
-/// reduction.add on the result of vector operands (might be extended)
-/// multiplication into a scalar value, and adding the result to a chain. This
-/// recipe is abstract and needs to be lowered to concrete recipes before
-/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}.
+/// A recipe to represent inloop MulAccumulateReduction operations, multiplying
+/// the vector operands (which may be extended), performing a reduction.add on
+/// the result, and adding the scalar result to a chain. This recipe is abstract
+/// and needs to be lowered to concrete recipes before codegen. The operands are
+/// {ChainOp, VecOp1, VecOp2, [Condition]}.
class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
/// Opcode of the extend for VecOp1 and VecOp2.
Instruction::CastOps ExtOp;
@@ -2695,7 +2696,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
MulAcc->getDebugLoc()),
ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
- ResultTy(MulAcc->getResultType()) {}
+ ResultTy(MulAcc->getResultType()) {
+ transferFlags(*MulAcc);
+ setUnderlyingValue(MulAcc->getUnderlyingValue());
+ }
public:
VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
@@ -2740,9 +2744,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
~VPMulAccumulateReductionRecipe() override = default;
VPMulAccumulateReductionRecipe *clone() override {
- auto *Copy = new VPMulAccumulateReductionRecipe(this);
- Copy->transferFlags(*this);
- return Copy;
+ return new VPMulAccumulateReductionRecipe(this);
}
VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);