[llvm] [VPlan] Implement VPExtendedReduction, VPMulAccumulateReductionRecipe and corresponding vplan transformations. (PR #137746)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon May 5 07:29:18 PDT 2025
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/137746
>From c95c790bbaaa9bf07356980a80a32c69d6013de7 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 27 Apr 2025 23:32:25 -0700
Subject: [PATCH 1/3] [VPlan] Implement transformation for widen-cast/widen-mul
+ reduction to abstract recipe.
This patch introduces two new recipes:
* VPExtendedReductionRecipe
- cast + reduction.
* VPMulAccumulateReductionRecipe
- (cast) + mul + reduction.
This patch also implements the transformation that matches the following
patterns via VPlan and converts them to abstract recipes for better cost
estimation.
* VPExtendedReductionRecipe
- reduce(cast(...))
* VPMulAccumulateReductionRecipe
- reduce.add(mul(...))
- reduce.add(mul(ext(...), ext(...)))
- reduce.add(ext(mul(ext(...), ext(...))))
The converted abstract recipes will be lowered to the concrete recipes
(widen-cast + widen-mul + reduction) just before recipe execution.
Split from #113903.
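
As an illustration only (this snippet is not part of the patch and the
function/value names are made up), a scalar loop like the one below yields
the reduce.add(mul(sext(A), sext(B))) pattern listed above once vectorized
with an in-loop reduction, and is expected to be folded into a single
VPMulAccumulateReductionRecipe by the new transform:

; Hypothetical input, for illustration only.
define i32 @mla_example(ptr %a, ptr %b, i32 %n) {
entry:
  br label %loop
loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ 0, %entry ], [ %acc.next, %loop ]
  %gep.a = getelementptr inbounds i8, ptr %a, i32 %iv
  %gep.b = getelementptr inbounds i8, ptr %b, i32 %iv
  %la = load i8, ptr %gep.a, align 1
  %lb = load i8, ptr %gep.b, align 1
  %ext.a = sext i8 %la to i32            ; ext(A)
  %ext.b = sext i8 %lb to i32            ; ext(B)
  %mul = mul nsw i32 %ext.a, %ext.b      ; mul(ext(A), ext(B))
  %acc.next = add nsw i32 %acc, %mul     ; reduce.add chain
  %iv.next = add nuw nsw i32 %iv, 1
  %cmp = icmp slt i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit
exit:
  ret i32 %acc.next
}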
---
.../Transforms/Vectorize/LoopVectorize.cpp | 19 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 259 +++++++++++++++++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 101 ++++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 247 +++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 7 +
llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +
.../LoopVectorize/ARM/mve-reduction-types.ll | 4 +-
.../LoopVectorize/ARM/mve-reductions.ll | 120 ++++----
.../LoopVectorize/reduction-inloop-pred.ll | 2 +-
.../LoopVectorize/reduction-inloop.ll | 8 +-
.../vplan-printing-reductions.ll | 145 ++++++++++
12 files changed, 838 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 53a900b3c5f8a..a0da55ce3514d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9631,10 +9631,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
-
// Update wide induction increments to use the same step as the corresponding
// wide induction. This enables detecting induction increments directly in
// VPlan and removes redundant splats.
@@ -9670,6 +9666,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ // Transform recipes to abstract recipes if it is legal and beneficial, and
+ // clamp the range for better cost estimation.
+ // TODO: Enable the following transform when the EVL versions of
+ // extended-reduction and mulacc-reduction are implemented.
+ if (!CM.foldTailWithEVL()) {
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
+ VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
+ CostCtx, Range);
+ }
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+ Plan->setName("Initial VPlan");
+
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 067a723a3aa4d..7c1b7948bdf0b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -525,6 +525,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPMulAccumulateReductionSC:
+ case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
@@ -609,13 +611,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
+ struct NonNegFlagsTy {
+ char NonNeg : 1;
+ NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {}
+ };
+
private:
struct ExactFlagsTy {
char IsExact : 1;
};
- struct NonNegFlagsTy {
- char NonNeg : 1;
- };
struct FastMathFlagsTy {
char AllowReassoc : 1;
char NoNaNs : 1;
@@ -709,6 +713,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
: VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
DisjointFlags(DisjointFlags) {}
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ NonNegFlagsTy NonNegFlags, DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp),
+ NonNegFlags(NonNegFlags) {}
+
protected:
template <typename IterT>
VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
@@ -728,7 +738,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -820,6 +832,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
FastMathFlags getFastMathFlags() const;
+ /// Returns true if the recipe has non-negative flag.
+ bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; }
+
+ bool isNonNeg() const {
+ assert(OpType == OperationType::NonNegOp &&
+ "recipe doesn't have a NNEG flag");
+ return NonNegFlags.NonNeg;
+ }
+
bool hasNoUnsignedWrap() const {
assert(OpType == OperationType::OverflowingBinOp &&
"recipe doesn't have a NUW flag");
@@ -1231,11 +1252,22 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
: VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
Opcode(I.getOpcode()) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
+ iterator_range<IterT> Operands, bool NUW, bool NSW, DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
+ Opcode(Opcode) {}
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned Opcode, iterator_range<IterT> Operands, bool NUW,
+ bool NSW, DebugLoc DL)
+ : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
+
~VPWidenRecipe() override = default;
VPWidenRecipe *clone() override {
@@ -1280,8 +1312,15 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
"opcode of underlying cast doesn't match");
}
- VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
- : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(),
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(),
+ Opcode(Opcode), ResultTy(ResultTy) {}
+
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ bool IsNonNeg, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg),
+ DL),
Opcode(Opcode), ResultTy(ResultTy) {}
~VPWidenCastRecipe() override = default;
@@ -2376,6 +2415,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
setUnderlyingValue(I);
}
+ /// For VPExtendedReductionRecipe.
+ /// Note that the debug location is from the extend.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+ /// For VPMulAccumulateReductionRecipe.
+ /// Note that the NUW/NSW flags and the debug location are from the Mul.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
@@ -2384,6 +2445,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
+ VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL = {})
+ : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
+ ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
+ IsOrdered, DL) {}
+
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
@@ -2394,7 +2462,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2474,6 +2544,181 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
}
};
+/// A recipe to represent inloop extended reduction operations, performing a
+/// reduction on an extended vector operand into a scalar value, and adding the
+/// result to a chain. This recipe is abstract and needs to be lowered to
+/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
+/// [Condition]}.
+class VPExtendedReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe that this reduction will be lowered to.
+ Instruction::CastOps ExtOp;
+
+ Type *ResultTy;
+
+ /// For cloning VPExtendedReductionRecipe.
+ VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
+ : VPReductionRecipe(
+ VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
+ {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
+ ExtRed->isOrdered(), ExtRed->getDebugLoc()),
+ ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
+ transferFlags(*ExtRed);
+ }
+
+public:
+ VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
+ : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
+ R->isOrdered(), Ext->getDebugLoc()),
+ ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
+ // Not all VPWidenCastRecipes contain the nneg flag. Transfer flags from the
+ // original recipe to avoid setting wrong flags.
+ transferFlags(*Ext);
+ }
+
+ ~VPExtendedReductionRecipe() override = default;
+
+ VPExtendedReductionRecipe *clone() override {
+ auto *Copy = new VPExtendedReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPExtendedReductionRecipe should be transform to "
+ "VPExtendedRecipe + VPReductionRecipe before execution.");
+ };
+
+ /// Return the cost of VPExtendedReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The scalar type after extending.
+ Type *getResultType() const { return ResultTy; }
+
+ /// Is the extend ZExt?
+ bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
+
+ /// The opcode of extend recipe.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+};
+
+/// A recipe to represent inloop MulAccumulateReduction operations, performing a
+/// reduction.add on the result of multiplying (possibly extended) vector
+/// operands into a scalar value, and adding the result to a chain. This
+/// recipe is abstract and needs to be lowered to concrete recipes before
+/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}.
+class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe.
+ Instruction::CastOps ExtOp;
+
+ /// Non-neg flag of the extend recipe.
+ bool IsNonNeg = false;
+
+ Type *ResultTy;
+
+ /// For cloning VPMulAccumulateReductionRecipe.
+ VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
+ {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
+ MulAcc->getCondOp(), MulAcc->isOrdered(),
+ WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
+ MulAcc->getDebugLoc()),
+ ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
+ ResultTy(MulAcc->getResultType()) {}
+
+public:
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
+ VPWidenCastRecipe *Ext0,
+ VPWidenCastRecipe *Ext1, Type *ResultTy)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateteReductionRecipe must "
+ "be Add");
+ // Only set the non-negative flag if the original recipe contains it.
+ if (Ext0->hasNonNegFlag())
+ IsNonNeg = Ext0->isNonNeg();
+ }
+
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Instruction::CastOps::CastOpsEnd) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateReductionRecipe must be "
+ "Add");
+ }
+
+ ~VPMulAccumulateReductionRecipe() override = default;
+
+ VPMulAccumulateReductionRecipe *clone() override {
+ auto *Copy = new VPMulAccumulateReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPMulAccumulateReductionRecipe should transform to "
+ "VPWidenCastRecipe + "
+ "VPWidenRecipe + VPReductionRecipe before execution");
+ }
+
+ /// Return the cost of VPMulAccumulateReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ Type *getResultType() const {
+ assert(isExtended() && "Only support getResultType when this recipe "
+ "contains an implicit extend.");
+ return ResultTy;
+ }
+
+ /// The VPValue of the vector value to be extended and reduced.
+ VPValue *getVecOp0() const { return getOperand(1); }
+ VPValue *getVecOp1() const { return getOperand(2); }
+
+ /// Return if this MulAcc recipe contains extended operands.
+ bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
+
+ /// Return the opcode of the extends for the operands.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+
+ /// Return if the operands are zero extended.
+ bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
+
+ /// Return the non negative flag of the ext recipe.
+ bool isNonNeg() const { return IsNonNeg; }
+};
+
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c86815c84d8d9..7dcbd72c25191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -273,6 +273,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
+ .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
+ [](const auto *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 54990c7c806c4..7fb5ea5cc4b93 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -71,6 +71,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -118,6 +120,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -155,6 +159,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
@@ -2480,28 +2486,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
+ std::optional<FastMathFlags> OptionalFMF =
+ ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
- // TODO: Support any-of and in-loop reductions.
+ // TODO: Support any-of reductions.
assert(
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
- assert(
- (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
- ForceTargetInstructionCost.getNumOccurrences() > 0) &&
- "In-loop reduction not implemented in VPlan-based cost model currently.");
- // Cost = Reduction cost + BinOp cost
- InstructionCost Cost =
- Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- return Cost +
- Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+ return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
- return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
- Ctx.CostKind);
+ return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
+ Ctx.CostKind);
+}
+
+InstructionCost
+VPExtendedReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
+ Type *RedTy = Ctx.Types.inferScalarType(this);
+ auto *SrcVecTy =
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
+ assert(RedTy->isIntegerTy() &&
+ "ExtendedReduction only support integer type currently.");
+ InstructionCost Cost = Ctx.TTI.getExtendedReductionCost(
+ Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+ // The cost of this recipe should be decided by the legacy model.
+ return Cost.isValid() ? 0 : Cost;
+}
+
+InstructionCost
+VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ Type *RedTy = Ctx.Types.inferScalarType(this);
+ auto *SrcVecTy =
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
+ InstructionCost Cost =
+ Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind);
+ // The cost of this recipe should be decided by the legacy model.
+ return Cost.isValid() ? 0 : Cost;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2546,6 +2573,56 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
O << ")";
}
+
+void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EXTENDED-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " +";
+ O << " reduce."
+ << Instruction::getOpcodeName(
+ RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
+ << " (";
+ getVecOp()->printAsOperand(O, SlotTracker);
+ O << " extended to " << *getResultType();
+ if (isConditional()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+}
+
+void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "MULACC-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " + ";
+ O << "reduce."
+ << Instruction::getOpcodeName(
+ RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
+ << " (";
+ O << "mul";
+ printFlags(O);
+ if (isExtended())
+ O << "(";
+ getVecOp0()->printAsOperand(O, SlotTracker);
+ if (isExtended())
+ O << " extended to " << *getResultType() << "), (";
+ else
+ O << ", ";
+ getVecOp1()->printAsOperand(O, SlotTracker);
+ if (isExtended())
+ O << " extended to " << *getResultType() << ")";
+ if (isConditional()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+}
#endif
bool VPReplicateRecipe::shouldPack() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7093d378d8c3e..b3d9de5f28317 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2420,6 +2420,82 @@ void VPlanTransforms::createInterleaveGroups(
}
}
+// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe.
+static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
+ VPWidenCastRecipe *Ext;
+ // Only ZExt contains non-neg flags.
+ if (ExtRed->isZExt())
+ Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
+ ExtRed->getResultType(), ExtRed->isNonNeg(),
+ ExtRed->getDebugLoc());
+ else
+ Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
+ ExtRed->getResultType(), ExtRed->getDebugLoc());
+
+ auto *Red = new VPReductionRecipe(
+ ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext,
+ ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc());
+ Ext->insertBefore(ExtRed);
+ Red->insertBefore(ExtRed);
+ ExtRed->replaceAllUsesWith(Red);
+ ExtRed->eraseFromParent();
+}
+
+// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) +
+// VPReductionRecipe (reduce.add)
+// + VPWidenCastRecipe (optional).
+static void
+expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
+ // Generate inner VPWidenCastRecipes if necessary.
+ // Note that we will drop the extend after the mul, which transforms
+ // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+ VPValue *Op0, *Op1;
+ if (MulAcc->isExtended()) {
+ Type *RedTy = MulAcc->getResultType();
+ if (MulAcc->isZExt())
+ Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
+ RedTy, MulAcc->isNonNeg(),
+ MulAcc->getDebugLoc());
+ else
+ Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
+ RedTy, MulAcc->getDebugLoc());
+ Op0->getDefiningRecipe()->insertBefore(MulAcc);
+ // Prevent reduce.add(mul(ext(A), ext(A))) from generating a duplicate
+ // VPWidenCastRecipe.
+ if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) {
+ Op1 = Op0;
+ } else {
+ if (MulAcc->isZExt())
+ Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
+ RedTy, MulAcc->isNonNeg(),
+ MulAcc->getDebugLoc());
+ else
+ Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
+ RedTy, MulAcc->getDebugLoc());
+ Op1->getDefiningRecipe()->insertBefore(MulAcc);
+ }
+ } else {
+ // No extends in this MulAccRecipe.
+ Op0 = MulAcc->getVecOp0();
+ Op1 = MulAcc->getVecOp1();
+ }
+
+ std::array<VPValue *, 2> MulOps = {Op0, Op1};
+ auto *Mul = new VPWidenRecipe(
+ Instruction::Mul, make_range(MulOps.begin(), MulOps.end()),
+ MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap(),
+ MulAcc->getDebugLoc());
+ Mul->insertBefore(MulAcc);
+
+ auto *Red = new VPReductionRecipe(
+ MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul,
+ MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc());
+ Red->insertBefore(MulAcc);
+
+ MulAcc->replaceAllUsesWith(Red);
+ MulAcc->eraseFromParent();
+}
+
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
using namespace llvm::VPlanPatternMatch;
@@ -2482,6 +2558,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
VPI->replaceAllUsesWith(VectorStep);
ToRemove.push_back(VPI);
}
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R))
+ expandVPExtendedReduction(ExtRed);
+ if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(&R))
+ expandVPMulAccumulateReduction(MulAcc);
+ }
}
for (VPRecipeBase *R : ToRemove)
@@ -2586,6 +2668,171 @@ void VPlanTransforms::handleUncountableEarlyExit(
LatchExitingBranch->eraseFromParent();
}
+/// This function tries to convert extended in-loop reductions to
+/// VPExtendedReductionRecipe and clamps \p Range if it is beneficial and
+/// valid. The created recipe must be lowered to concrete recipes before
+/// execution.
+static VPExtendedReductionRecipe *
+tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
+ VFRange &Range) {
+ using namespace VPlanPatternMatch;
+
+ Type *RedTy = Ctx.Types.inferScalarType(Red);
+ VPValue *VecOp = Red->getVecOp();
+
+ // Clamp the range if using extended-reduction is profitable.
+ auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
+ Type *SrcTy) -> bool {
+ return LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
+ Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
+ CostKind);
+ InstructionCost ExtCost =
+ cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
+ InstructionCost RedCost = Red->computeCost(VF, Ctx);
+ return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
+ },
+ Range);
+ };
+
+ VPValue *A;
+ // Match reduce(ext(A)).
+ if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
+ IsExtendedRedValidAndClampRange(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+ cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ Ctx.Types.inferScalarType(A)))
+ return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(VecOp));
+
+ return nullptr;
+}
+
+/// This function tries to convert in-loop reductions to
+/// VPMulAccumulateReductionRecipe and clamps \p Range if it is beneficial
+/// and valid. The created VPMulAccumulateReductionRecipe must be lowered to
+/// concrete recipes before execution. Patterns of MulAccumulateReduction:
+/// reduce.add(mul(...)),
+/// reduce.add(mul(ext(A), ext(B))),
+/// reduce.add(ext(mul(ext(A), ext(B)))).
+static VPMulAccumulateReductionRecipe *
+tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
+ VPCostContext &Ctx, VFRange &Range) {
+ using namespace VPlanPatternMatch;
+
+ Type *RedTy = Ctx.Types.inferScalarType(Red);
+
+ // Clamp the range if using multiply-accumulate-reduction is profitable.
+ auto IsMulAccValidAndClampRange =
+ [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
+ VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
+ return LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ Type *SrcTy =
+ Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
+ auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
+ InstructionCost MulAccCost =
+ Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+ InstructionCost MulCost = Mul->computeCost(VF, Ctx);
+ InstructionCost RedCost = Red->computeCost(VF, Ctx);
+ InstructionCost ExtCost = 0;
+ if (Ext0)
+ ExtCost += Ext0->computeCost(VF, Ctx);
+ if (Ext1)
+ ExtCost += Ext1->computeCost(VF, Ctx);
+ if (OuterExt)
+ ExtCost += OuterExt->computeCost(VF, Ctx);
+
+ return MulAccCost.isValid() &&
+ MulAccCost < ExtCost + MulCost + RedCost;
+ },
+ Range);
+ };
+
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+ if (Opcode != Instruction::Add)
+ return nullptr;
+
+ VPValue *VecOp = Red->getVecOp();
+ VPValue *A, *B;
+ // Try to match reduce.add(mul(...))
+ if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
+ auto *RecipeA =
+ dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+ auto *RecipeB =
+ dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+ auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+
+ // Match reduce.add(mul(ext, ext))
+ if (RecipeA && RecipeB &&
+ (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
+ match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
+ match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
+ IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ Mul, RecipeA, RecipeB, nullptr))
+ return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB,
+ RecipeA->getResultType());
+ // Match reduce.add(mul)
+ if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
+ return new VPMulAccumulateReductionRecipe(Red, Mul);
+ }
+ // Match reduce.add(ext(mul(ext(A), ext(B))))
+ // All extend recipes must have the same opcode, or A == B, which can be
+ // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
+ m_ZExtOrSExt(m_VPValue()))))) {
+ auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
+ auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
+ auto *Ext0 =
+ cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
+ auto *Ext1 =
+ cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
+ if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+ Ext0->getOpcode() == Ext1->getOpcode() &&
+ IsMulAccValidAndClampRange(Ext0->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ Mul, Ext0, Ext1, Ext))
+ return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1,
+ Ext->getResultType());
+ }
+ return nullptr;
+}
+
+/// This function tries to create abstract recipes from the reduction recipe
+/// for subsequent optimizations and cost estimation.
+static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
+ VPCostContext &Ctx,
+ VFRange &Range) {
+ VPReductionRecipe *AbstractR = nullptr;
+
+ if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
+ AbstractR = MulAcc;
+ else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
+ AbstractR = ExtRed;
+ // Cannot create abstract inloop reduction recipes.
+ if (!AbstractR)
+ return;
+
+ AbstractR->insertBefore(Red);
+ Red->replaceAllUsesWith(AbstractR);
+}
+
+void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range) {
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
+ tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+ }
+ }
+}
+
void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
if (Plan.hasScalarVFOnly())
return;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 64e28c286a962..97719a0763020 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -187,6 +187,13 @@ struct VPlanTransforms {
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
+ /// This function converts initial recipes to abstract recipes and clamps
+ /// \p Range based on the cost model for subsequent optimizations and cost
+ /// estimation. The converted abstract recipes will be lowered to concrete
+ /// recipes before codegen.
+ static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range);
+
/// Perform instcombine-like simplifications on recipes in \p Plan. Use \p
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 638156eab7a84..64065edd315f9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -339,6 +339,8 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPMulAccumulateReductionSC,
+ VPExtendedReductionSC,
VPPartialReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index 2078a10d04ce7..ce3b2a9f216f2 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -23,11 +23,11 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
@@ -105,11 +105,11 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index a11cc15a8a85b..d021306b89aab 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -646,12 +646,11 @@ define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -726,12 +725,11 @@ define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -855,10 +853,10 @@ define i32 @mla_i16_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
@@ -910,10 +908,10 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
@@ -1016,10 +1014,10 @@ define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP4]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP5]])
@@ -1122,10 +1120,10 @@ define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalia
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
@@ -1459,9 +1457,8 @@ define i64 @mla_xx_sext_zext(ptr nocapture noundef readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -1528,11 +1525,11 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP2]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -1667,24 +1664,55 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP11]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], splat (i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3:%.*]], [[ADD:%.*]]
+; CHECK-NEXT: [[ADD:%.*]] = phi i64 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD3:%.*]] = phi i64 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3]], [[ADD]]
; CHECK-NEXT: ret i64 [[DIV]]
; CHECK: for.body:
-; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY1]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD3]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X:%.*]], i32 [[I_013]]
+; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD1]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_013]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP0]], 8
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SHR]] to i64
-; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_014]], [[CONV]]
+; CHECK-NEXT: [[ADD1]] = add nsw i64 [[S_014]], [[CONV]]
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV]]
-; CHECK-NEXT: [[ADD3]] = add nuw nsw i64 [[MUL]], [[T_012]]
+; CHECK-NEXT: [[ADD5]] = add nuw nsw i64 [[MUL]], [[T_012]]
; CHECK-NEXT: [[ADD4]] = add nuw nsw i32 [[I_013]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[ADD4]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY1]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -1720,10 +1748,10 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 7
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -8
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
@@ -1731,28 +1759,26 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i32> [[TMP7]] to <4 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[TMP11]] to <4 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]])
+; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <8 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP13]])
; CHECK-NEXT: [[TMP16]] = add i64 [[TMP15]], [[TMP10]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
@@ -1787,7 +1813,7 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]]
; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP38:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP40:![0-9]+]]
;
entry:
%cmp23 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index 05f26b8a0a273..618044c9a188b 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -476,10 +476,10 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK: pred.load.continue8:
; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]]
; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]])
; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]]
; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]])
; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index f6a1ebf8b0fe9..1ab498bee08be 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -225,9 +225,9 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1289,15 +1289,13 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], splat (i8 31)
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], splat (i8 3)
; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], splat (i8 31)
; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 2cf630de208c9..cf920c91913fb 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -268,3 +268,148 @@ loop:
exit:
ret i64 %cond
}
+
+define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_extended_reduction'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
+; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv
+ %load0 = load i32, ptr %arrayidx, align 4
+ %conv0 = zext i32 %load0 to i64
+ %rdx.next = add nsw i64 %rdx, %conv0
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+ ret i64 %r.0.lcssa
+}
+
+define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_mulacc'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
+; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, ptr %x, i32 %iv
+ %load0 = load i64, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i64, ptr %y, i32 %iv
+ %load1 = load i64, ptr %arrayidx1, align 4
+ %mul = mul nsw i64 %load0, %load1
+ %rdx.next = add nsw i64 %rdx, %mul
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+ ret i64 %r.0.lcssa
+}
+
+define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_mulacc_extended'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
+; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv
+ %load0 = load i16, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv
+ %load1 = load i16, ptr %arrayidx1, align 4
+ %conv0 = sext i16 %load0 to i32
+ %conv1 = sext i16 %load1 to i32
+ %mul = mul nsw i32 %conv0, %conv1
+ %conv = sext i32 %mul to i64
+ %rdx.next = add nsw i64 %rdx, %conv
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+ ret i64 %r.0.lcssa
+}
>From d35013cb7b986396a5649f16fe1ced87454404bd Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 4 May 2025 16:16:25 -0700
Subject: [PATCH 2/3] Fixup: address review comments.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 7c1b7948bdf0b..bd9778d6350bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2571,6 +2571,10 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
{R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
R->isOrdered(), Ext->getDebugLoc()),
ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
+ assert((ExtOp == Instruction::CastOps::ZExt ||
+ ExtOp == Instruction::CastOps::SExt) &&
+ "VPExtendedReductionRecipe only support zext and sext.");
+
// Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
// the original recipe to prevent setting wrong flags.
transferFlags(*Ext);
@@ -2579,9 +2583,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
~VPExtendedReductionRecipe() override = default;
VPExtendedReductionRecipe *clone() override {
- auto *Copy = new VPExtendedReductionRecipe(this);
- Copy->transferFlags(*this);
- return Copy;
+ return new VPExtendedReductionRecipe(this);
}
VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
@@ -2651,6 +2653,9 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
Instruction::Add &&
"The reduction instruction in MulAccumulateteReductionRecipe must "
"be Add");
+ assert((ExtOp == Instruction::CastOps::ZExt ||
+ ExtOp == Instruction::CastOps::SExt) &&
+ "VPMulAccumulateReductionRecipe only support zext and sext.");
// Only set the non-negative flag if the original recipe contains.
if (Ext0->hasNonNegFlag())
IsNonNeg = Ext0->isNonNeg();
>From 2f703d5f5e1c3fb3df76b6b54eb531cdf307736a Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 5 May 2025 07:22:17 -0700
Subject: [PATCH 3/3] !fixup, Remove `computeCost()` for new recipes.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 39 ++++---------------
.../vplan-printing-reductions.ll | 12 +++---
3 files changed, 16 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bd9778d6350bb..533c90adf1c07 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2578,6 +2578,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
// Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
// the original recipe to prevent setting wrong flags.
transferFlags(*Ext);
+ setUnderlyingValue(R->getUnderlyingValue());
}
~VPExtendedReductionRecipe() override = default;
@@ -2593,10 +2594,6 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
"VPExtendedRecipe + VPReductionRecipe before execution.");
};
- /// Return the cost of VPExtendedReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2656,6 +2653,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
assert((ExtOp == Instruction::CastOps::ZExt ||
ExtOp == Instruction::CastOps::SExt) &&
"VPMulAccumulateReductionRecipe only support zext and sext.");
+ setUnderlyingValue(R->getUnderlyingValue());
// Only set the non-negative flag if the original recipe contains.
if (Ext0->hasNonNegFlag())
IsNonNeg = Ext0->isNonNeg();
@@ -2673,6 +2671,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
Instruction::Add &&
"The reduction instruction in MulAccumulateReductionRecipe must be "
"Add");
+ setUnderlyingValue(R->getUnderlyingValue());
}
~VPMulAccumulateReductionRecipe() override = default;
@@ -2691,10 +2690,6 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
"VPWidenRecipe + VPReductionRecipe before execution");
}
- /// Return the cost of VPMulAccumulateReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7fb5ea5cc4b93..8cc8635cf7413 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2486,8 +2486,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
- std::optional<FastMathFlags> OptionalFMF =
- ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
// TODO: Support any-of reductions.
assert(
@@ -2495,40 +2493,17 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
+ // Cost = Reduction cost + BinOp cost
+ InstructionCost Cost =
+ Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+ return Cost +
+ Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
- return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
- Ctx.CostKind);
-}
-
-InstructionCost
-VPExtendedReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
- Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
- assert(RedTy->isIntegerTy() &&
- "ExtendedReduction only support integer type currently.");
- InstructionCost Cost = Ctx.TTI.getExtendedReductionCost(
- Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
- // The cost of this recipe should be decided by the legacy model.
- return Cost.isValid() ? 0 : Cost;
-}
-
-InstructionCost
-VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
- InstructionCost Cost =
- Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind);
- // The cost of this recipe should be decided by the legacy model.
- return Cost.isValid() ? 0 : Cost;
+ return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
+ Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index cf920c91913fb..307228220cb15 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -283,12 +283,12 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
-; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
+; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -327,7 +327,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -335,7 +335,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -376,7 +376,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -384,7 +384,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
+; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64))
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors