[llvm] 4eb30cf - [LV][EVL] Support in-loop reduction using tail folding with EVL. (#90184)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 16 01:15:29 PDT 2024
Author: Mel Chen
Date: 2024-07-16T16:15:24+08:00
New Revision: 4eb30cfb3474e3770b465cdb39db3b7f6404c3ef
URL: https://github.com/llvm/llvm-project/commit/4eb30cfb3474e3770b465cdb39db3b7f6404c3ef
DIFF: https://github.com/llvm/llvm-project/commit/4eb30cfb3474e3770b465cdb39db3b7f6404c3ef.diff
LOG: [LV][EVL] Support in-loop reduction using tail folding with EVL. (#90184)
Following from #87816, add VPReductionEVLRecipe to describe vector
predication reduction.
Address one of TODOs from #76172.
Added:
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
Modified:
llvm/include/llvm/IR/VectorBuilder.h
llvm/include/llvm/Transforms/Utils/LoopUtils.h
llvm/lib/IR/VectorBuilder.cpp
llvm/lib/Transforms/Utils/LoopUtils.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/lib/Transforms/Vectorize/VPlanValue.h
llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index 301edaed70fe8..6af7f6075551d 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -15,6 +15,7 @@
#ifndef LLVM_IR_VECTORBUILDER_H
#define LLVM_IR_VECTORBUILDER_H
+#include <llvm/Analysis/IVDescriptors.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instruction.h>
@@ -57,6 +58,11 @@ class VectorBuilder {
return RetType();
}
+ /// Helper function for creating VP intrinsic call.
+ Value *createVectorInstructionImpl(Intrinsic::ID VPID, Type *ReturnTy,
+ ArrayRef<Value *> VecOpArray,
+ const Twine &Name = Twine());
+
public:
VectorBuilder(IRBuilderBase &Builder,
Behavior ErrorHandling = Behavior::ReportAndAbort)
@@ -92,6 +98,15 @@ class VectorBuilder {
Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy,
ArrayRef<Value *> VecOpArray,
const Twine &Name = Twine());
+
+ /// Emit a VP reduction intrinsic call for recurrence kind.
+ /// \param Kind The kind of recurrence
+ /// \param ValTy The type of operand which the reduction operation is
+ /// performed.
+ /// \param VecOpArray The operand list.
+ Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+ ArrayRef<Value *> VecOpArray,
+ const Twine &Name = Twine());
};
} // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 345e09dce0b2b..1a878126aa082 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -15,6 +15,7 @@
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
namespace llvm {
@@ -394,6 +395,10 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
/// Fast-math-flags are propagated using the IRBuilder's setting.
Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
RecurKind RdxKind);
+/// Overloaded function to generate vector-predication intrinsics for target
+/// reduction.
+Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src,
+ const RecurrenceDescriptor &Desc);
/// Create a target reduction of the given vector \p Src for a reduction of the
/// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is
@@ -414,6 +419,11 @@ Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
Value *createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
Value *Start);
+/// Overloaded function to generate vector-predication intrinsics for ordered
+/// reduction.
+Value *createOrderedReduction(VectorBuilder &VB,
+ const RecurrenceDescriptor &Desc, Value *Src,
+ Value *Start);
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
index c07bc0561fba9..5ff3082879895 100644
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -57,7 +57,70 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
auto VPID = VPIntrinsic::getForOpcode(Opcode);
if (VPID == Intrinsic::not_intrinsic)
return returnWithError<Value *>("No VPIntrinsic for this opcode");
+ return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
+}
+
+Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+ ArrayRef<Value *> InstOpArray,
+ const Twine &Name) {
+ Intrinsic::ID VPID;
+ switch (Kind) {
+ case RecurKind::Add:
+ VPID = Intrinsic::vp_reduce_add;
+ break;
+ case RecurKind::Mul:
+ VPID = Intrinsic::vp_reduce_mul;
+ break;
+ case RecurKind::And:
+ VPID = Intrinsic::vp_reduce_and;
+ break;
+ case RecurKind::Or:
+ VPID = Intrinsic::vp_reduce_or;
+ break;
+ case RecurKind::Xor:
+ VPID = Intrinsic::vp_reduce_xor;
+ break;
+ case RecurKind::FMulAdd:
+ case RecurKind::FAdd:
+ VPID = Intrinsic::vp_reduce_fadd;
+ break;
+ case RecurKind::FMul:
+ VPID = Intrinsic::vp_reduce_fmul;
+ break;
+ case RecurKind::SMax:
+ VPID = Intrinsic::vp_reduce_smax;
+ break;
+ case RecurKind::SMin:
+ VPID = Intrinsic::vp_reduce_smin;
+ break;
+ case RecurKind::UMax:
+ VPID = Intrinsic::vp_reduce_umax;
+ break;
+ case RecurKind::UMin:
+ VPID = Intrinsic::vp_reduce_umin;
+ break;
+ case RecurKind::FMax:
+ VPID = Intrinsic::vp_reduce_fmax;
+ break;
+ case RecurKind::FMin:
+ VPID = Intrinsic::vp_reduce_fmin;
+ break;
+ case RecurKind::FMaximum:
+ VPID = Intrinsic::vp_reduce_fmaximum;
+ break;
+ case RecurKind::FMinimum:
+ VPID = Intrinsic::vp_reduce_fminimum;
+ break;
+ default:
+ llvm_unreachable("No VPIntrinsic for this reduction");
+ }
+ return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
+}
+Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID,
+ Type *ReturnTy,
+ ArrayRef<Value *> InstOpArray,
+ const Twine &Name) {
auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID);
auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID);
size_t NumInstParams = InstOpArray.size();
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index a127a3265758d..ff93035ce0652 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1192,6 +1192,19 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
}
}
+Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src,
+ const RecurrenceDescriptor &Desc) {
+ RecurKind Kind = Desc.getRecurrenceKind();
+ assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+ "AnyOf reduction is not supported.");
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ Type *SrcEltTy = SrcTy->getElementType();
+ Value *Iden =
+ Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
+ Value *Ops[] = {Iden, Src};
+ return VBuilder.createSimpleTargetReduction(Kind, SrcTy, Ops);
+}
+
Value *llvm::createTargetReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc, Value *Src,
PHINode *OrigPhi) {
@@ -1220,6 +1233,20 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B,
return B.CreateFAddReduce(Start, Src);
}
+Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
+ const RecurrenceDescriptor &Desc,
+ Value *Src, Value *Start) {
+ assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+ Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
+ "Unexpected reduction kind");
+ assert(Src->getType()->isVectorTy() && "Expected a vector type");
+ assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
+
+ auto *SrcTy = cast<VectorType>(Src->getType());
+ Value *Ops[] = {Start, Src};
+ return VBuilder.createSimpleTargetReduction(RecurKind::FAdd, SrcTy, Ops);
+}
+
void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
bool IncludeWrapFlags) {
auto *VecOp = dyn_cast<Instruction>(I);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5520baef7152d..4eb88fe32a820 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1516,9 +1516,7 @@ class LoopVectorizationCostModel {
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath &&
// FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth() &&
- // FIXME: remove this once reductions are supported.
- Legal->getReductionVars().empty();
+ Legal->isSafeForAnyVectorWidth();
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e3b78700e1018..805d9d91fc186 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -909,6 +909,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPEVLBasedIVPHISC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPInstructionSC:
+ case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
@@ -2171,17 +2172,27 @@ class VPReductionRecipe : public VPSingleDefRecipe {
/// The recurrence decriptor for the reduction in question.
const RecurrenceDescriptor &RdxDesc;
bool IsOrdered;
+ /// Whether the reduction is conditional.
+ bool IsConditional = false;
+
+protected:
+ VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
+ Instruction *I, ArrayRef<VPValue *> Operands,
+ VPValue *CondOp, bool IsOrdered)
+ : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
+ if (CondOp) {
+ IsConditional = true;
+ addOperand(CondOp);
+ }
+ }
public:
VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
bool IsOrdered)
- : VPSingleDefRecipe(VPDef::VPReductionSC,
- ArrayRef<VPValue *>({ChainOp, VecOp}), I),
- RdxDesc(R), IsOrdered(IsOrdered) {
- if (CondOp)
- addOperand(CondOp);
- }
+ : VPReductionRecipe(VPDef::VPReductionSC, R, I,
+ ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
+ IsOrdered) {}
~VPReductionRecipe() override = default;
@@ -2190,7 +2201,15 @@ class VPReductionRecipe : public VPSingleDefRecipe {
getVecOp(), getCondOp(), IsOrdered);
}
- VP_CLASSOF_IMPL(VPDef::VPReductionSC)
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ }
+
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
/// Generate the reduction in the loop
void execute(VPTransformState &State) override;
@@ -2201,13 +2220,62 @@ class VPReductionRecipe : public VPSingleDefRecipe {
VPSlotTracker &SlotTracker) const override;
#endif
+ /// Return the recurrence decriptor for the in-loop reduction.
+ const RecurrenceDescriptor &getRecurrenceDescriptor() const {
+ return RdxDesc;
+ }
+ /// Return true if the in-loop reduction is ordered.
+ bool isOrdered() const { return IsOrdered; };
+ /// Return true if the in-loop reduction is conditional.
+ bool isConditional() const { return IsConditional; };
/// The VPValue of the scalar Chain being accumulated.
VPValue *getChainOp() const { return getOperand(0); }
/// The VPValue of the vector value to be reduced.
VPValue *getVecOp() const { return getOperand(1); }
/// The VPValue of the condition for the block.
VPValue *getCondOp() const {
- return getNumOperands() > 2 ? getOperand(2) : nullptr;
+ return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
+ }
+};
+
+/// A recipe to represent inloop reduction operations with vector-predication
+/// intrinsics, performing a reduction on a vector operand with the explicit
+/// vector length (EVL) into a scalar value, and adding the result to a chain.
+/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
+class VPReductionEVLRecipe : public VPReductionRecipe {
+public:
+ VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
+ : VPReductionRecipe(
+ VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
+ cast_or_null<Instruction>(R->getUnderlyingValue()),
+ ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
+ R->isOrdered()) {}
+
+ ~VPReductionEVLRecipe() override = default;
+
+ VPReductionEVLRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
+
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The VPValue of the explicit vector length.
+ VPValue *getEVL() const { return getOperand(2); }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getEVL();
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 36d52b255232a..6d89ad9fee8ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -274,6 +274,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
[](const VPScalarCastRecipe *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
+ })
+ .Case<VPReductionRecipe>([this](const auto *R) {
+ return inferScalarType(R->getChainOp());
});
assert(ResultTy && "could not infer type for the given VPValue");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 53d91ee27b73f..4b1ac79bbfdd4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -63,6 +63,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPPredInstPHISC:
return false;
case VPBlendSC:
+ case VPReductionEVLSC:
case VPReductionSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -104,6 +105,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenStoreSC:
return false;
case VPBlendSC:
+ case VPReductionEVLSC:
case VPReductionSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -151,6 +153,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
}
case VPBlendSC:
+ case VPReductionEVLSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPWidenCanonicalIVSC:
@@ -1744,6 +1747,46 @@ void VPReductionRecipe::execute(VPTransformState &State) {
}
}
+void VPReductionEVLRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Reduction being replicated.");
+ assert(State.UF == 1 &&
+ "Expected only UF == 1 when vectorizing with explicit vector length.");
+
+ auto &Builder = State.Builder;
+ // Propagate the fast-math flags carried by the underlying instruction.
+ IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
+ const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
+ Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
+ Value *VecOp = State.get(getVecOp(), 0);
+ Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL);
+ Value *Mask;
+ // TODO: move the all-true mask generation into VectorBuilder.
+ if (VPValue *CondOp = getCondOp())
+ Mask = State.get(CondOp, 0);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ VBuilder.setMask(Mask);
+
+ Value *NewRed;
+ if (isOrdered()) {
+ NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
+ } else {
+ NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
+ NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
+ else
+ NewRed = Builder.CreateBinOp(
+ (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
+ }
+ State.set(this, NewRed, 0, /*IsScalar*/ true);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -1756,7 +1799,31 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << getUnderlyingInstr()->getFastMathFlags();
O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
getVecOp()->printAsOperand(O, SlotTracker);
- if (getCondOp()) {
+ if (isConditional()) {
+ O << ", ";
+ getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ if (RdxDesc.IntermediateStore)
+ O << " (with final reduction value stored in invariant address sank "
+ "outside of loop)";
+}
+
+void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
+ O << Indent << "REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ getChainOp()->printAsOperand(O, SlotTracker);
+ O << " +";
+ if (isa<FPMathOperator>(getUnderlyingInstr()))
+ O << getUnderlyingInstr()->getFastMathFlags();
+ O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
+ getVecOp()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (isConditional()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b3396a0a46af4..d668ae2aa5c08 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,11 +1427,20 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
// bail out if the plan contains any.
- if (any_of(Header->phis(), [](VPRecipeBase &Phi) {
- return (isa<VPWidenIntOrFpInductionRecipe>(&Phi) ||
- isa<VPWidenPointerInductionRecipe>(&Phi));
- }))
+ bool ContainsWidenInductions = any_of(Header->phis(), [](VPRecipeBase &Phi) {
+ return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
+ &Phi);
+ });
+ // FIXME: Remove this once we can transform (select header_mask, true_value,
+ // false_value) into vp.merge.
+ bool ContainsOutloopReductions =
+ any_of(Header->phis(), [&](VPRecipeBase &Phi) {
+ auto *R = dyn_cast<VPReductionPHIRecipe>(&Phi);
+ return R && !R->isInLoop();
+ });
+ if (ContainsWidenInductions || ContainsOutloopReductions)
return false;
+
auto *CanonicalIVPHI = Plan.getCanonicalIV();
VPValue *StartV = CanonicalIVPHI->getStartValue();
@@ -1462,23 +1471,42 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
- if (!MemR)
+ VPRecipeBase *NewRecipe = nullptr;
+ auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!CurRecipe)
continue;
- VPValue *OrigMask = MemR->getMask();
- assert(OrigMask && "Unmasked widen memory recipe when folding tail");
- VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
- if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
- auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
- N->insertBefore(L);
- L->replaceAllUsesWith(N);
- L->eraseFromParent();
- } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
- auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
- N->insertBefore(S);
- S->eraseFromParent();
- } else {
- llvm_unreachable("unsupported recipe");
+
+ auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
+ assert(OrigMask && "Unmasked recipe when folding tail");
+ return HeaderMask == OrigMask ? nullptr : OrigMask;
+ };
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
+ VPValue *NewMask = GetNewMask(MemR->getMask());
+ if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
+ NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+ else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
+ NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+ else
+ llvm_unreachable("unsupported recipe");
+ } else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
+ NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
+ GetNewMask(RedR->getCondOp()));
+ }
+
+ if (NewRecipe) {
+ [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
+ assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
+ "New recipe must define the same number of values as the "
+ "original.");
+ assert(
+ NumDefVal <= 1 &&
+ "Only supports recipes with a single definition or without users.");
+ NewRecipe->insertBefore(CurRecipe);
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
+ VPValue *CurVPV = CurRecipe->getVPSingleValue();
+ CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
+ }
+ CurRecipe->eraseFromParent();
}
}
recursivelyDeleteDeadRecipes(HeaderMask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index fa6a65ff2f3ad..452c977106a77 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -341,6 +341,7 @@ class VPDef {
VPExpandSCEVSC,
VPInstructionSC,
VPInterleaveSC,
+ VPReductionEVLSC,
VPReductionSC,
VPReplicateSC,
VPScalarCastSC,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 4e26eb38f21a9..8824fa8a16b74 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-OUTLOOP %s
+; RUN: opt -passes=loop-vectorize -prefer-inloop-reductions -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-INLOOP %s
+
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
-; FIXME: inloop reductions are not supported yet with predicated vectorization.
-
define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-LABEL: @add_i16_i32(
; OUTLOOP-NEXT: entry:
@@ -117,67 +117,84 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
-; IF-EVL-LABEL: @add_i16_i32(
-; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; IF-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; IF-EVL: for.body.preheader:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; IF-EVL-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]]
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
-; IF-EVL-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP6]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP7]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP5]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP10]], i32 2, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i16> poison)
-; IF-EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; IF-EVL-NEXT: [[TMP12]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[VEC_PHI]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IF-EVL: middle.block:
-; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP16]] to i32
-; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
-; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; IF-EVL: for.cond.cleanup.loopexit:
-; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; IF-EVL: for.cond.cleanup:
-; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; IF-EVL-NEXT: ret i32 [[R_0_LCSSA]]
+; IF-EVL-OUTLOOP-LABEL: @add_i16_i32(
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL-OUTLOOP: for.body.preheader:
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[I_08]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL-OUTLOOP: for.cond.cleanup:
+; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[R_0_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: @add_i16_i32(
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL-INLOOP: for.body.preheader:
+; IF-EVL-INLOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 8
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP10]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
+; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL-INLOOP: for.cond.cleanup:
+; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
%cmp6 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
new file mode 100644
index 0000000000000..8bde5ba5f1519
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -0,0 +1,891 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-OUTLOOP
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-INLOOP
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-OUTLOOP
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-INLOOP
+
+define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add(
+; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP27]], 3
+; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP27]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: define i32 @cond_add(
+; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP25]], 3
+; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP25]], i32 0
+; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-INLOOP: for.end:
+; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-OUTLOOP-LABEL: define i32 @cond_add(
+; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-OUTLOOP-NEXT: entry:
+; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
+; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-OUTLOOP: vector.body:
+; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-OUTLOOP-NEXT: [[TMP17]] = add <vscale x 4 x i32> [[TMP16]], [[VEC_PHI]]
+; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-OUTLOOP: middle.block:
+; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-OUTLOOP: scalar.ph:
+; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-OUTLOOP: for.body:
+; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3
+; NO-VP-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP15]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-OUTLOOP: for.end:
+; NO-VP-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; NO-VP-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-INLOOP-LABEL: define i32 @cond_add(
+; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-INLOOP-NEXT: entry:
+; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-INLOOP: vector.body:
+; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
+; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
+; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-INLOOP: middle.block:
+; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-INLOOP: scalar.ph:
+; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-INLOOP: for.body:
+; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 3
+; NO-VP-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP14]], i32 0
+; NO-VP-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-INLOOP: for.end:
+; NO-VP-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-INLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp = icmp sgt i32 %0, 3
+ %select = select i1 %cmp, i32 %0, i32 0
+ %add = add nsw i32 %select, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %add
+}
+
+define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add_pred(
+; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP27:%.*]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], 3
+; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]]
+; IF-EVL-OUTLOOP: if.then:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP28]]
+; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]]
+; IF-EVL-OUTLOOP: for.inc:
+; IF-EVL-OUTLOOP-NEXT: [[TMP27]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: define i32 @cond_add_pred(
+; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-INLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP25]], 3
+; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL-INLOOP: if.then:
+; IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP25]]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_INC]]
+; IF-EVL-INLOOP: for.inc:
+; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-INLOOP: for.end:
+; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+; NO-VP-OUTLOOP-LABEL: define i32 @cond_add_pred(
+; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; NO-VP-OUTLOOP-NEXT: entry:
+; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
+; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-OUTLOOP: vector.body:
+; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
+; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP22]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP16]]
+; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP-OUTLOOP: middle.block:
+; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PREDPHI]])
+; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-OUTLOOP: scalar.ph:
+; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-OUTLOOP: for.body:
+; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
+; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP-OUTLOOP: if.then:
+; NO-VP-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP15]]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_INC]]
+; NO-VP-OUTLOOP: for.inc:
+; NO-VP-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP-OUTLOOP: for.end:
+; NO-VP-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; NO-VP-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+; NO-VP-INLOOP-LABEL: define i32 @cond_add_pred(
+; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; NO-VP-INLOOP-NEXT: entry:
+; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-INLOOP: vector.body:
+; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
+; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
+; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP-INLOOP: middle.block:
+; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-INLOOP: scalar.ph:
+; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-INLOOP: for.body:
+; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
+; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 3
+; NO-VP-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP-INLOOP: if.then:
+; NO-VP-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP14]]
+; NO-VP-INLOOP-NEXT: br label [[FOR_INC]]
+; NO-VP-INLOOP: for.inc:
+; NO-VP-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP-INLOOP: for.end:
+; NO-VP-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %rdx = phi i32 [ %start, %entry ], [ %rdx.add, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp = icmp sgt i32 %0, 3
+ br i1 %cmp, label %if.then, label %for.inc
+
+if.then:
+ %add.pred = add nsw i32 %rdx, %0
+ br label %for.inc
+
+for.inc:
+ %rdx.add = phi i32 [ %add.pred, %if.then ], [ %rdx, %for.body ]
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %rdx.add
+}
+
+define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-OUTLOOP-LABEL: define i32 @step_cond_add(
+; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP37]], [[IV_TRUNC]]
+; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP37]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add(
+; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], [[IV_TRUNC]]
+; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP28]], i32 0
+; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-INLOOP: for.end:
+; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add(
+; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; NO-VP-OUTLOOP-NEXT: entry:
+; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
+; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP12]], zeroinitializer
+; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
+; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP16]]
+; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
+; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-OUTLOOP: vector.body:
+; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
+; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP20]], align 4
+; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
+; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-OUTLOOP-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI]]
+; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; NO-VP-OUTLOOP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP-OUTLOOP: middle.block:
+; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP23]])
+; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-OUTLOOP: scalar.ph:
+; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-OUTLOOP: for.body:
+; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP21]], [[IV_TRUNC]]
+; NO-VP-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP21]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-VP-OUTLOOP: for.end:
+; NO-VP-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-VP-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-INLOOP-LABEL: define i32 @step_cond_add(
+; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; NO-VP-INLOOP-NEXT: entry:
+; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> [[TMP6]], zeroinitializer
+; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP10]]
+; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-INLOOP: vector.body:
+; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
+; NO-VP-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; NO-VP-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]]
+; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; NO-VP-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP-INLOOP: middle.block:
+; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-INLOOP: scalar.ph:
+; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-INLOOP: for.body:
+; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-INLOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP20]], [[IV_TRUNC]]
+; NO-VP-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP20]], i32 0
+; NO-VP-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
+; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-VP-INLOOP: for.end:
+; NO-VP-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; NO-VP-INLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %iv.trunc = trunc i64 %iv to i32
+ %cmp = icmp sgt i32 %0, %iv.trunc
+ %select = select i1 %cmp, i32 %0, i32 0
+ %add = add nsw i32 %select, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %add
+}
+
+define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-OUTLOOP-LABEL: define i32 @step_cond_add_pred(
+; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP37:%.*]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP38]], [[IV_TRUNC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]]
+; IF-EVL-OUTLOOP: if.then:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP38]]
+; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]]
+; IF-EVL-OUTLOOP: for.inc:
+; IF-EVL-OUTLOOP-NEXT: [[TMP37]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add_pred(
+; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP32:%.*]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP35]], [[IV_TRUNC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]]
+; IF-EVL-INLOOP: if.then:
+; IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP35]]
+; IF-EVL-INLOOP-NEXT: br label [[MIDDLE_BLOCK]]
+; IF-EVL-INLOOP: for.inc:
+; IF-EVL-INLOOP-NEXT: [[TMP32]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6]]
+; IF-EVL-INLOOP: for.end:
+; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add_pred(
+; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; NO-VP-OUTLOOP-NEXT: entry:
+; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
+; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP12]], zeroinitializer
+; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
+; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP16]]
+; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
+; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-OUTLOOP: vector.body:
+; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
+; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP27]], align 4
+; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
+; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; NO-VP-OUTLOOP-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[TMP28]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP22]]
+; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP-OUTLOOP: middle.block:
+; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PREDPHI]])
+; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-OUTLOOP: scalar.ph:
+; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-OUTLOOP: for.body:
+; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
+; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP21]], [[IV_TRUNC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP-OUTLOOP: if.then:
+; NO-VP-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP21]]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_INC]]
+; NO-VP-OUTLOOP: for.inc:
+; NO-VP-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-VP-OUTLOOP: for.end:
+; NO-VP-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; NO-VP-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+; NO-VP-INLOOP-LABEL: define i32 @step_cond_add_pred(
+; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; NO-VP-INLOOP-NEXT: entry:
+; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> [[TMP6]], zeroinitializer
+; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP10]]
+; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-INLOOP: vector.body:
+; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
+; NO-VP-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; NO-VP-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]]
+; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; NO-VP-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP-INLOOP: middle.block:
+; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-INLOOP: scalar.ph:
+; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-INLOOP: for.body:
+; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
+; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-INLOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP20]], [[IV_TRUNC]]
+; NO-VP-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP-INLOOP: if.then:
+; NO-VP-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP20]]
+; NO-VP-INLOOP-NEXT: br label [[FOR_INC]]
+; NO-VP-INLOOP: for.inc:
+; NO-VP-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-VP-INLOOP: for.end:
+; NO-VP-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; NO-VP-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %rdx = phi i32 [ %start, %entry ], [ %rdx.add, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %iv.trunc = trunc i64 %iv to i32
+ %cmp = icmp sgt i32 %0, %iv.trunc
+ br i1 %cmp, label %if.then, label %for.inc
+
+if.then:
+ %add.pred = add nsw i32 %rdx, %0
+ br label %for.inc
+
+for.inc:
+ %rdx.add = phi i32 [ %add.pred, %if.then ], [ %rdx, %for.body ]
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %rdx.add
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; IF-EVL-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; IF-EVL-INLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; IF-EVL-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; IF-EVL-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]]}
+; IF-EVL-INLOOP: [[META7]] = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; NO-VP-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-VP-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP-OUTLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO-VP-OUTLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-VP-OUTLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO-VP-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO-VP-OUTLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO-VP-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO-VP-OUTLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+;.
+; NO-VP-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-VP-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP-INLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO-VP-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-VP-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO-VP-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO-VP-INLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO-VP-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO-VP-INLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
new file mode 100644
index 0000000000000..73dc3e4313a65
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
@@ -0,0 +1,1965 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
+
+define i32 @add(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @add(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @add(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = add nsw i32 [[TMP12]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %add
+}
+
+; not support mul reduction for scalable vector
+define i32 @mul(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @mul(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]]
+;
+; NO-VP-LABEL: @mul(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP7]] = mul i32 [[TMP6]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD2]])
+; NO-VP-NEXT: [[TMP9]] = mul i32 [[TMP8]], [[VEC_PHI1]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP9]], [[TMP7]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MUL]] = mul nsw i32 [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %mul, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %mul = mul nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %mul
+}
+
+define i32 @or(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @or(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.or.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15]] = or i32 [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[OR]] = or i32 [[TMP18]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[OR_LCSSA]]
+;
+; NO-VP-LABEL: @or(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP10]] = or i32 [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[OR]] = or i32 [[TMP12]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[OR_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %or, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %or = or i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %or
+}
+
+define i32 @and(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @and(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.and.nxv4i32(i32 -1, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15]] = and i32 [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[AND]] = and i32 [[TMP18]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[AND_LCSSA]]
+;
+; NO-VP-LABEL: @and(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP10]] = and i32 [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[AND]] = and i32 [[TMP12]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[AND_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %and, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %and = and i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %and
+}
+
+define i32 @xor(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @xor(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.xor.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15]] = xor i32 [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP18]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]]
+;
+; NO-VP-LABEL: @xor(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP10]] = xor i32 [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[XOR]] = xor i32 [[TMP12]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[XOR_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %xor, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %xor = xor i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %xor
+}
+
+define i32 @smin(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @smin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP17]], [[RDX]]
+; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]]
+;
+; NO-VP-LABEL: @smin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[SMIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %smin, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp slt i32 %0, %rdx
+ %smin = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %smin
+}
+
+define i32 @smax(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @smax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -2147483648, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP17]], [[RDX]]
+; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]]
+;
+; NO-VP-LABEL: @smax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[SMAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %smax, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp sgt i32 %0, %rdx
+ %smax = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %smax
+}
+
+define i32 @umin(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @umin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 -1, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP17]], [[RDX]]
+; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]]
+;
+; NO-VP-LABEL: @umin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[UMIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %umin, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp ult i32 %0, %rdx
+ %umin = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %umin
+}
+
+define i32 @umax(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @umax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP17]], [[RDX]]
+; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]]
+;
+; NO-VP-LABEL: @umax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[UMAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %umax, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp ugt i32 %0, %rdx
+ %umax = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %umax
+}
+
+define float @fadd(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fadd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15]] = fadd reassoc float [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP18]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @fadd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP10]] = fadd reassoc float [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = fadd reassoc float [[TMP12]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %add = fadd reassoc float %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %add
+}
+
+; not support fmul reduction for scalable vector
+define float @fmul(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmul(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MUL_LCSSA]]
+;
+; NO-VP-LABEL: @fmul(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP7]] = fmul reassoc float [[TMP6]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD2]])
+; NO-VP-NEXT: [[TMP9]] = fmul reassoc float [[TMP8]], [[VEC_PHI1]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[BIN_RDX:%.*]] = fmul reassoc float [[TMP9]], [[TMP7]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MUL]] = fmul reassoc float [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %mul, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %mul = fmul reassoc float %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %mul
+}
+
+define float @fmin(ptr %a, i64 %n, float %start) #0 {
+; IF-EVL-LABEL: @fmin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call fast float @llvm.vp.reduce.fmin.nxv4f32(float 0x7FF0000000000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP14]], float [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP17]], [[RDX]]
+; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
+;
+; NO-VP-LABEL: @fmin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP9]], float [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP11]], float [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %min, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp = fcmp fast olt float %0, %rdx
+ %min = select i1 %cmp, float %0, float %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %min
+}
+
+define float @fmax(ptr %a, i64 %n, float %start) #0 {
+; IF-EVL-LABEL: @fmax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call fast float @llvm.vp.reduce.fmax.nxv4f32(float 0xFFF0000000000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP14]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP14]], float [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP17]], [[RDX]]
+; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
+;
+; NO-VP-LABEL: @fmax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP9]], [[VEC_PHI]]
+; NO-VP-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP9]], float [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP11]], float [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %max, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp = fcmp fast ogt float %0, %rdx
+ %max = select i1 %cmp, float %0, float %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %max
+}
+
+define float @fminimum(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fminimum(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
+;
+; NO-VP-LABEL: @fminimum(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]])
+; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[RDX_MINMAX]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP10]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %min, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %min = tail call float @llvm.minimum.f32(float %rdx, float %0)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %min
+}
+
+define float @fmaximum(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmaximum(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
+;
+; NO-VP-LABEL: @fmaximum(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]])
+; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[RDX_MINMAX]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP10]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %max, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %max = tail call float @llvm.maximum.f32(float %rdx, float %0)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %max
+}
+
+define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmuladd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = fmul reassoc <vscale x 4 x float> [[VP_OP_LOAD]], [[VP_OP_LOAD1]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP18]] = fadd reassoc float [[TMP17]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[MULADD_LCSSA]]
+;
+; NO-VP-LABEL: @fmuladd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[TMP11:%.*]] = fmul reassoc <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-VP-NEXT: [[TMP12:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP11]])
+; NO-VP-NEXT: [[TMP13]] = fadd reassoc float [[TMP12]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[RDX]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MULADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
+ %1 = load float, ptr %arrayidx2, align 4
+ %muladd = tail call reassoc float @llvm.fmuladd.f32(float %0, float %1, float %rdx)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %muladd
+}
+
+define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
+; IF-EVL-LABEL: @anyof_icmp(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], 3
+; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+; NO-VP-LABEL: @anyof_icmp(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]]
+; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP14]], 3
+; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp slt i32 %0, 3
+ %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %anyof
+}
+
+define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
+; IF-EVL-LABEL: @anyof_fcmp(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP0]], 3.000000e+00
+; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+; NO-VP-LABEL: @anyof_fcmp(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]]
+; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP14]], 3.000000e+00
+; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp.i = fcmp fast olt float %0, 3.0
+ %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %anyof
+}
+
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fmuladd.f32(float, float, float)
+
+attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
new file mode 100644
index 0000000000000..fcea9e8d81ff6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-OUTLOOP
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-INLOOP
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-OUTLOOP
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-INLOOP
+
+define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) {
+; IF-EVL-OUTLOOP-LABEL: define void @reduction_intermediate_store(
+; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP27]], [[RDX]]
+; IF-EVL-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: ret void
+;
+; IF-EVL-INLOOP-LABEL: define void @reduction_intermediate_store(
+; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; IF-EVL-INLOOP: vector.memcheck:
+; IF-EVL-INLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4
+; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = shl i64 [[N]], 2
+; IF-EVL-INLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
+; IF-EVL-INLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]]
+; IF-EVL-INLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; IF-EVL-INLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; IF-EVL-INLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP13]], i32 4, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
+; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP14]]), !alias.scope [[META0:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP14]])
+; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP14]] to i64
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]]
+; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP25]], [[RDX]]
+; IF-EVL-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4
+; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-INLOOP: for.end:
+; IF-EVL-INLOOP-NEXT: ret void
+;
+; NO-VP-OUTLOOP-LABEL: define void @reduction_intermediate_store(
+; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-OUTLOOP-NEXT: entry:
+; NO-VP-OUTLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; NO-VP-OUTLOOP: vector.memcheck:
+; NO-VP-OUTLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4
+; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 2
+; NO-VP-OUTLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-OUTLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]]
+; NO-VP-OUTLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; NO-VP-OUTLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]]
+; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-OUTLOOP: vector.body:
+; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
+; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META0:![0-9]+]]
+; NO-VP-OUTLOOP-NEXT: [[TMP12]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-OUTLOOP: middle.block:
+; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; NO-VP-OUTLOOP-NEXT: store i32 [[TMP14]], ptr [[ADDR]], align 4
+; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-OUTLOOP: scalar.ph:
+; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-OUTLOOP: for.body:
+; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP15]], [[RDX]]
+; NO-VP-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4
+; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP-OUTLOOP: for.end:
+; NO-VP-OUTLOOP-NEXT: ret void
+;
+; NO-VP-INLOOP-LABEL: define void @reduction_intermediate_store(
+; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-INLOOP-NEXT: entry:
+; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; NO-VP-INLOOP: vector.memcheck:
+; NO-VP-INLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4
+; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 2
+; NO-VP-INLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-INLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]]
+; NO-VP-INLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; NO-VP-INLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-INLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]]
+; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-INLOOP: vector.body:
+; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META0:![0-9]+]]
+; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
+; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-INLOOP: middle.block:
+; NO-VP-INLOOP-NEXT: store i32 [[TMP12]], ptr [[ADDR]], align 4
+; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP-INLOOP: scalar.ph:
+; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-INLOOP: for.body:
+; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-INLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP14]], [[RDX]]
+; NO-VP-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4
+; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP-INLOOP: for.end:
+; NO-VP-INLOOP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, %rdx
+ store i32 %add, ptr %addr, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; IF-EVL-INLOOP: [[META0]] = !{[[META1:![0-9]+]]}
+; IF-EVL-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; IF-EVL-INLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]}
+; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1}
+; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"}
+; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+;.
+; NO-VP-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]}
+; NO-VP-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; NO-VP-OUTLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; NO-VP-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]}
+; NO-VP-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+;.
+; NO-VP-INLOOP: [[META0]] = !{[[META1:![0-9]+]]}
+; NO-VP-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; NO-VP-INLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; NO-VP-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]}
+; NO-VP-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
new file mode 100644
index 0000000000000..314d30f86ee57
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
+
+define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
+; IF-EVL-LABEL: @fadd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP14]] = call float @llvm.vp.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret float [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @fadd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
+; NO-VP-NEXT: ret float [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %add = fadd float %0, %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %add
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
new file mode 100644
index 0000000000000..2bbcd362ce16c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -0,0 +1,1534 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
+
+define i32 @add(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @add(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @add(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = add nsw i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %add
+}
+
+; not support mul reduction for scalable vector
+define i32 @mul(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @mul(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]]
+;
+; NO-VP-LABEL: @mul(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
+; NO-VP-NEXT: [[TMP7]] = mul <8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP8]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP8]], [[TMP7]]
+; NO-VP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[BIN_RDX]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MUL]] = mul nsw i32 [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %mul, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %mul = mul nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %mul
+}
+
+define i32 @or(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @or(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[OR]] = or i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[OR_LCSSA]]
+;
+; NO-VP-LABEL: @or(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[OR]] = or i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[OR_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %or, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %or = or i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %or
+}
+
+define i32 @and(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @and(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[AND]] = and i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[AND_LCSSA]]
+;
+; NO-VP-LABEL: @and(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), i32 [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10]] = and <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[AND]] = and i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[AND_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %and, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %and = and i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %and
+}
+
+define i32 @xor(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @xor(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]]
+;
+; NO-VP-LABEL: @xor(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10]] = xor <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[XOR]] = xor i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[XOR_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %xor, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %xor = xor i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %xor
+}
+
+define i32 @smin(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @smin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]]
+;
+; NO-VP-LABEL: @smin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[SMIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %smin, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp slt i32 %0, %rdx
+ %smin = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %smin
+}
+
+define i32 @smax(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @smax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]]
+;
+; NO-VP-LABEL: @smax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[SMAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %smax, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp sgt i32 %0, %rdx
+ %smax = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %smax
+}
+
+define i32 @umin(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @umin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]]
+;
+; NO-VP-LABEL: @umin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[UMIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %umin, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp ult i32 %0, %rdx
+ %umin = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %umin
+}
+
+define i32 @umax(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-LABEL: @umax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]]
+;
+; NO-VP-LABEL: @umax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[UMAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %umax, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp ugt i32 %0, %rdx
+ %umax = select i1 %cmp.i, i32 %0, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %umax
+}
+
+define float @fadd(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fadd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[ADD_LCSSA]]
+;
+; NO-VP-LABEL: @fadd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10]] = fadd reassoc <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = fadd reassoc float [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %add = fadd reassoc float %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %add
+}
+
+; not support fmul reduction for scalable vector
+define float @fmul(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmul(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MUL_LCSSA]]
+;
+; NO-VP-LABEL: @fmul(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP0:%.*]] = insertelement <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 8
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; NO-VP-NEXT: [[TMP7]] = fmul reassoc <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP8]] = fmul reassoc <8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[BIN_RDX:%.*]] = fmul reassoc <8 x float> [[TMP8]], [[TMP7]]
+; NO-VP-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[BIN_RDX]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MUL]] = fmul reassoc float [[TMP11]], [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %mul, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %mul = fmul reassoc float %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %mul
+}
+
+define float @fmin(ptr %a, i64 %n, float %start) #0 {
+; IF-EVL-LABEL: @fmin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
+;
+; NO-VP-LABEL: @fmin(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP13]], float [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %min, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp = fcmp fast olt float %0, %rdx
+ %min = select i1 %cmp, float %0, float %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %min
+}
+
+define float @fmax(ptr %a, i64 %n, float %start) #0 {
+; IF-EVL-LABEL: @fmax(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP0]], [[RDX]]
+; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
+;
+; NO-VP-LABEL: @fmax(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast ogt <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[TMP10]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP13]], [[RDX]]
+; NO-VP-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP13]], float [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %max, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp = fcmp fast ogt float %0, %rdx
+ %max = select i1 %cmp, float %0, float %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %max
+}
+
+define float @fminimum(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fminimum(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MIN_LCSSA]]
+;
+; NO-VP-LABEL: @fminimum(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]])
+; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[RDX_MINMAX]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP10]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MIN_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %min, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %min = tail call float @llvm.minimum.f32(float %rdx, float %0)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %min
+}
+
+define float @fmaximum(ptr %a, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmaximum(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MAX_LCSSA]]
+;
+; NO-VP-LABEL: @fmaximum(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
+; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]])
+; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[RDX_MINMAX]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP10]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MAX_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %max, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %max = tail call float @llvm.maximum.f32(float %rdx, float %0)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %max
+}
+
+define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
+; IF-EVL-LABEL: @fmuladd(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[RDX]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret float [[MULADD_LCSSA]]
+;
+; NO-VP-LABEL: @fmuladd(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[START:%.*]], i32 0
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP7]]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD1]], <vscale x 4 x float> [[VEC_PHI]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP14:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[RDX]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret float [[MULADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi float [ %start, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
+ %1 = load float, ptr %arrayidx2, align 4
+ %muladd = tail call reassoc float @llvm.fmuladd.f32(float %0, float %1, float %rdx)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %muladd
+}
+
+define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
+; IF-EVL-LABEL: @anyof_icmp(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], 3
+; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+; NO-VP-LABEL: @anyof_icmp(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]]
+; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP14]], 3
+; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp.i = icmp slt i32 %0, 3
+ %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %anyof
+}
+
+define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
+; IF-EVL-LABEL: @anyof_fcmp(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP0]], 3.000000e+00
+; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+; NO-VP-LABEL: @anyof_fcmp(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]]
+; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP14]], 3.000000e+00
+; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; NO-VP: for.end:
+; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp.i = fcmp fast olt float %0, 3.0
+ %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %anyof
+}
+
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fmuladd.f32(float, float, float)
+
+attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
new file mode 100644
index 0000000000000..16db6cf828af8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -0,0 +1,166 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-OUTLOOP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-INLOOP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP-OUTLOOP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -prefer-inloop-reductions \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP-INLOOP %s
+
+
+define i32 @reduction(ptr %a, i64 %n, i32 %start) {
+; IF-EVL-OUTLOOP-NOT: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+
+; IF-EVL-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-INLOOP-NEXT: Live-in ir<%n> = original trip-count
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: Successor(s): vector loop
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: <x1> vector loop: {
+; IF-EVL-INLOOP-NEXT: vector.body:
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n>
+; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>)
+; IF-EVL-INLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-INLOOP-NEXT: No successors
+; IF-EVL-INLOOP-NEXT: }
+; IF-EVL-INLOOP-NEXT: Successor(s): middle.block
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: middle.block:
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir<true>
+; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
+; IF-EVL-INLOOP-NEXT: No successors
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT: No successors
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; IF-EVL-INLOOP-NEXT: }
+;
+
+; NO-VP-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: Successor(s): vector loop
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: <x1> vector loop: {
+; NO-VP-OUTLOOP-NEXT: vector.body:
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]>
+; NO-VP-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; NO-VP-OUTLOOP-NEXT: No successors
+; NO-VP-OUTLOOP-NEXT: }
+; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: middle.block:
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
+; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
+; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: ir-bb<for.end>:
+; NO-VP-OUTLOOP-NEXT: No successors
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT: No successors
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-OUTLOOP-NEXT: }
+;
+
+; NO-VP-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; NO-VP-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-INLOOP-NEXT: Live-in ir<%n> = original trip-count
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: Successor(s): vector loop
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: <x1> vector loop: {
+; NO-VP-INLOOP-NEXT: vector.body:
+; NO-VP-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]>
+; NO-VP-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + reduce.add (ir<[[LD1]]>)
+; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; NO-VP-INLOOP-NEXT: No successors
+; NO-VP-INLOOP-NEXT: }
+; NO-VP-INLOOP-NEXT: Successor(s): middle.block
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: middle.block:
+; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
+; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
+; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: ir-bb<for.end>:
+; NO-VP-INLOOP-NEXT: No successors
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT: No successors
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-INLOOP-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret i32 %add
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 4022c343d63ef..d9d6789134d88 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1133,6 +1133,20 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
EXPECT_FALSE(Recipe.mayReadOrWriteMemory());
}
+ {
+ VPValue ChainOp;
+ VPValue VecOp;
+ VPValue CondOp;
+ VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
+ &VecOp, false);
+ VPValue EVL;
+ VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+ EXPECT_FALSE(EVLRecipe.mayHaveSideEffects());
+ EXPECT_FALSE(EVLRecipe.mayReadFromMemory());
+ EXPECT_FALSE(EVLRecipe.mayWriteToMemory());
+ EXPECT_FALSE(EVLRecipe.mayReadOrWriteMemory());
+ }
+
{
auto *Load =
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
@@ -1472,6 +1486,21 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) {
EXPECT_TRUE(isa<VPUser>(BaseR));
}
+TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
+ LLVMContext C;
+
+ VPValue ChainOp;
+ VPValue VecOp;
+ VPValue CondOp;
+ VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
+ &VecOp, false);
+ VPValue EVL;
+ VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+ EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
+ VPRecipeBase *BaseR = &EVLRecipe;
+ EXPECT_TRUE(isa<VPUser>(BaseR));
+}
+
struct VPDoubleValueDef : public VPRecipeBase {
VPDoubleValueDef(ArrayRef<VPValue *> Operands) : VPRecipeBase(99, Operands) {
new VPValue(nullptr, this);
More information about the llvm-commits
mailing list