[llvm] [LV] Support strided load with a stride of -1 (PR #128718)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 08:09:40 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/128718
>From 00e256ca638b50e1c318d9dcf33d319f134e50f8 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:37:53 -0800
Subject: [PATCH 01/16] Init: New Recipe VPWidenStridedLoadRecipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 112 ++++++-
llvm/lib/Transforms/Vectorize/VPlan.h | 67 ++++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 64 +++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 19 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 2 +-
.../RISCV/riscv-vector-reverse-output.ll | 280 ++++++++----------
.../RISCV/riscv-vector-reverse.ll | 224 +++++++-------
...-force-tail-with-evl-reverse-load-store.ll | 97 +++---
10 files changed, 526 insertions(+), 346 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 907839711a39c..5bf80940617a5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1085,6 +1085,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Strided,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -1315,6 +1316,20 @@ class LoopVectorizationCostModel {
return InterleaveInfo.getInterleaveGroup(Instr);
}
+ /// Returns true if \p I is a memory instruction with strided memory access
+ /// that can be vectorized.
+ bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
+ /// Get the stride of the strided memory access instruction \p Instr. Return 0
+ /// if the instruction \p Instr is not considered for vectorization as a
+ /// strided memory access.
+ int64_t getStride(Instruction *Instr) const {
+ auto It = StrideInfo.find(Instr);
+ if (It != StrideInfo.end())
+ return It->second;
+ return 0;
+ }
+
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1562,6 +1577,10 @@ class LoopVectorizationCostModel {
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
+  /// The cost computation for a strided load/store instruction.
+ InstructionCost getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) const;
+
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1701,6 +1720,9 @@ class LoopVectorizationCostModel {
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
+ /// The mapping of memory access instructions to their stride values.
+ DenseMap<Instruction *, int64_t> StrideInfo;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -3276,6 +3298,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
return true;
}
+bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
+ Instruction *I, ElementCount VF) const {
+ // Get and ensure we have a valid memory instruction.
+ assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+
+ // Only support strided access for vector VF.
+ if (!VF.isVector())
+ return false;
+
+ // FIXME: Remove this check for StoreInst after strided store is supported.
+ if (isa<StoreInst>(I))
+ return false;
+
+ [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
+ auto *ScalarTy = getLoadStoreType(I);
+ // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
+ // to ensure that the accessed addresses are evenly spaced apart by a fixed
+ // stride.
+ assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
+ "Only supports strided accesses with a stride of -1");
+
+ const Align Alignment = getLoadStoreAlignment(I);
+ return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
+}
+
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
@@ -3366,9 +3413,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (IsUniformMemOpUse(I))
return true;
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
+ return (
+ WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -4184,7 +4231,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
- .Case<VPWidenLoadRecipe>(
+ .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4283,6 +4330,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -5430,6 +5478,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) const {
+ Type *ValTy = getLoadStoreType(I);
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
+}
+
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
@@ -5749,6 +5810,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
"Expected consecutive stride.");
InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ // Consider using strided load/store for consecutive reverse accesses to
+ // achieve more efficient memory operations.
+ if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
+ const InstructionCost StridedLoadStoreCost =
+ getStridedLoadStoreCost(&I, VF);
+ if (StridedLoadStoreCost < Cost) {
+ Decision = CM_Strided;
+ Cost = StridedLoadStoreCost;
+ StrideInfo[&I] = ConsecutiveStride;
+ }
+ }
setWideningDecision(&I, VF, Decision, Cost);
continue;
}
@@ -6395,6 +6467,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI::CastContextHint::Normal;
switch (getWideningDecision(I, VF)) {
+ // TODO: New CastContextHint for strided accesses.
+ case LoopVectorizationCostModel::CM_Strided:
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
case LoopVectorizationCostModel::CM_Interleave:
@@ -7752,16 +7826,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
// reverse consecutive.
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
+
+ auto SameWiden = [&](ElementCount VF) -> bool {
+ return Decision == CM.getWideningDecision(I, VF);
+ };
+ bool ContainsWidenVF =
+ LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+ assert(ContainsWidenVF &&
+ "At least widen the memory accesses by the Start VF.");
+
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
- if (Consecutive) {
+ if (Consecutive || Strided) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
if (Reverse) {
+ assert(!Strided && "Reverse and Strided are mutually exclusive.");
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
@@ -7773,7 +7858,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, I->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+ VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
@@ -7781,9 +7866,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
- if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+ if (Strided) {
+ const DataLayout &DL = Load->getDataLayout();
+ auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
+ int64_t Stride = CM.getStride(Load);
+ assert(Stride == -1 &&
+ "Only stride memory access with a stride of -1 is supported.");
+ VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+ StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
+ return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
+ Mask, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
+ }
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ }
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
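
Note that the stride operand handed to VPWidenStridedLoadRecipe above is expressed
in bytes, not elements: the consecutive stride (-1) is scaled by the DataLayout
allocation size of the loaded type. A minimal sketch of that arithmetic, assuming
an i32 element so the allocation size is 4 bytes (values here are illustrative):

    #include <cstdint>

    // Byte stride for a strided access: element stride times element size.
    // For ElemStride = -1 and ElemSizeInBytes = 4 this yields -4, which is the
    // "i64 -4" stride operand seen in the RISC-V tests further down.
    int64_t byteStride(int64_t ElemStride, uint64_t ElemSizeInBytes) {
      return ElemStride * static_cast<int64_t>(ElemSizeInBytes);
    }
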
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c5b214b355545..569869e8e4bd4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1764,16 +1765,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
};
/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// Supports both consecutive and reverse consecutive accesses.
+/// TODO: Support non-unit strided accesses.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;
+  /// Indicates whether to compute the pointer for strided memory accesses.
+ bool Strided;
+
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
- DebugLoc DL)
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+ GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
GEPFlags, DL),
- IndexedTy(IndexedTy) {}
+ IndexedTy(IndexedTy), Strided(Strided) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -1794,7 +1800,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
}
VPVectorPointerRecipe *clone() override {
- return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+ return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
getGEPNoWrapFlags(), getDebugLoc());
}
@@ -2931,7 +2937,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata {
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
}
static inline bool classof(const VPUser *U) {
@@ -3050,6 +3057,56 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
}
};
+/// A recipe for strided load operations, using the base address, stride, and an
+/// optional mask. This recipe generates a vp.strided.load intrinsic call to
+/// represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+ public VPValue {
+ VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+ VPValue *VF, VPValue *Mask,
+ const VPIRMetadata &Metadata, DebugLoc DL)
+ : VPWidenMemoryRecipe(
+ VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+ /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+ VPValue(this, &Load) {
+ setMask(Mask);
+ }
+
+ VPWidenStridedLoadRecipe *clone() override {
+ return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+ getStride(), getVF(), getMask(), *this,
+ getDebugLoc());
+ }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC)
+
+ /// Return the stride operand.
+ VPValue *getStride() const { return getOperand(1); }
+
+ /// Return the VF operand.
+ VPValue *getVF() const { return getOperand(2); }
+
+ /// Generate a strided load.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenStridedLoadRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() || Op == getStride() || Op == getVF();
+ }
+};
+
/// A recipe for widening store operations, using the stored value, the address
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..714fef032c9b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -184,8 +184,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
- "Store recipes should not define any values");
+ assert(
+ (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 06511b61a67c3..f05c5b178a3e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -103,6 +104,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -184,6 +186,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
}
case VPInterleaveSC:
return mayWriteToMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -2386,8 +2389,13 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+ // TODO: Support non-unit-reverse strided accesses.
+ Value *Index =
+ Strided
+ ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
+ : Increment;
Value *ResultPtr =
- Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
@@ -3226,6 +3234,60 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+ Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
+ Value *Stride = State.get(getStride(), /*IsScalar*/ true);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask())
+ Mask = State.get(VPMask);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
+ Builder.getInt32Ty());
+
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = Stride->getType();
+ CallInst *NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ applyMetadata(*NewLI);
+ State.set(this, NewLI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = load ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", stride = ";
+ getStride()->printAsOperand(O, SlotTracker);
+ O << ", runtimeVF = ";
+ getVF()->printAsOperand(O, SlotTracker);
+}
+#endif
+
+InstructionCost
+VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
+}
+
void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
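
For readers unfamiliar with the intrinsic emitted by the execute() above,
llvm.experimental.vp.strided.load reads up to EVL elements spaced a fixed number
of bytes apart, under a lane mask. A rough scalar model of that behavior for i32
elements (illustrative only; masked-off lanes are left at zero here, whereas the
intrinsic leaves them unspecified):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Scalar model of a strided load: lane I reads sizeof(int32_t) bytes from
    // Addr + I * StrideInBytes when Mask[I] is set, for I < EVL.
    std::vector<int32_t> stridedLoadModel(const char *Addr, int64_t StrideInBytes,
                                          const std::vector<bool> &Mask,
                                          unsigned EVL) {
      std::vector<int32_t> Lanes(EVL, 0);
      for (unsigned I = 0; I < EVL; ++I) {
        if (!Mask[I])
          continue;
        std::memcpy(&Lanes[I], Addr + static_cast<int64_t>(I) * StrideInBytes,
                    sizeof(int32_t));
      }
      return Lanes;
    }
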
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 931d4d42f56e4..75113706df420 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2144,6 +2144,12 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
VPValue *NewMask = GetNewMask(L->getMask());
return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
})
+ .Case<VPWidenStridedLoadRecipe>([&](VPWidenStridedLoadRecipe *L) {
+ VPValue *NewMask = GetNewMask(L->getMask());
+ return new VPWidenStridedLoadRecipe(
+ *cast<LoadInst>(&L->getIngredient()), L->getAddr(), L->getStride(),
+ &EVL, NewMask, *L, L->getDebugLoc());
+ })
.Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
VPValue *NewMask = GetNewMask(S->getMask());
return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
@@ -2198,10 +2204,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
- assert(all_of(Plan.getVF().users(),
- IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
- VPWidenIntOrFpInductionRecipe>) &&
- "User of VF that we can't transform to EVL.");
+ assert(
+ all_of(
+ Plan.getVF().users(),
+ IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
+ VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>) &&
+ "User of VF that we can't transform to EVL.");
Plan.getVF().replaceAllUsesWith(&EVL);
// Create a scalar phi to track the previous EVL if fixed-order recurrence is
@@ -2240,7 +2248,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
NumDefVal <= 1 &&
"Only supports recipes with a single definition or without users.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe,
+ VPWidenStridedLoadRecipe>(EVLRecipe)) {
VPValue *CurVPV = CurRecipe->getVPSingleValue();
CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 279cdac92d2d1..d9b1f7d4f5d53 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -340,6 +340,7 @@ class VPDef {
VPPartialReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
+ VPWidenStridedLoadSC,
VPVectorPointerSC,
VPVectorEndPointerSC,
VPWidenCallSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 38ada33d7ee19..bc9d40834c185 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
return VerifyEVLUse(*S, S->getNumOperands() - 1);
})
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
- VPWidenIntOrFpInductionRecipe>(
+ VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
.Case<VPScalarIVStepsRecipe>([&](auto *R) {
if (R->getNumOperands() != 3) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 29b27cdb7556d..d53fb60e7c7c9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -37,27 +37,23 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64: [[VECTOR_BODY]]:
; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
-; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP19]], align 4
+; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV64-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP13]]
+; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
+; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE]], ptr [[TMP17]], align 4
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -68,8 +64,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1
+; RV64-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1
; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -96,29 +92,24 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32: [[VECTOR_BODY]]:
; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
-; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
-; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
-; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP21]], align 4
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP9]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]]
+; RV32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], 1
+; RV32-NEXT: [[TMP16:%.*]] = mul i32 -1, [[TMP15]]
+; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]]
+; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 [[TMP16]]
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
+; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE]], ptr [[TMP18]], align 4
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -129,8 +120,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1
+; RV32-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], 1
; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -158,39 +149,34 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2: [[VECTOR_BODY]]:
; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
-; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1
+; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP13]]
+; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; RV64-UF2-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD1]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP25]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE]], ptr [[TMP24]], align 4
+; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP29]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -251,27 +237,23 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64: [[VECTOR_BODY]]:
; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
-; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
-; RV64-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP19]], align 4
+; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV64-NEXT: [[TMP11:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]]
+; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP15]]
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP11]])
+; RV64-NEXT: store <vscale x 4 x float> [[REVERSE]], ptr [[TMP17]], align 4
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -282,8 +264,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
+; RV64-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00
; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -310,29 +292,24 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32: [[VECTOR_BODY]]:
; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
-; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
-; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
-; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
-; RV32-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP21]], align 4
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP9]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT: [[TMP11:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP14:%.*]] = mul i32 0, [[TMP13]]
+; RV32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], 1
+; RV32-NEXT: [[TMP16:%.*]] = mul i32 -1, [[TMP15]]
+; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]]
+; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP16]]
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP11]])
+; RV32-NEXT: store <vscale x 4 x float> [[REVERSE]], ptr [[TMP18]], align 4
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -343,8 +320,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
+; RV32-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP20]], 1.000000e+00
; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -372,39 +349,34 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2: [[VECTOR_BODY]]:
; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
-; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1
+; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP13]]
+; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP25]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP17]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE]], ptr [[TMP24]], align 4
+; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP18]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP29]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b4e49a60e0887..416eba01440b1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -38,7 +38,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
@@ -80,9 +80,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<%3> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
@@ -97,20 +97,20 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
-; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
-; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
@@ -147,7 +147,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
@@ -178,7 +178,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 24
+; CHECK-NEXT: LV: Loop cost is 23
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -189,9 +189,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%18> = VF
-; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF
-; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: Live-in ir<%0> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
@@ -232,19 +232,19 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
+; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
@@ -333,25 +333,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
-; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 %18, 1
-; CHECK-NEXT: %26 = mul i64 -1, %25
-; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24
-; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26
-; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %28, align 4
-; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
-; CHECK-NEXT: %29 = add <vscale x 4 x i32> %reverse, splat (i32 1)
-; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT: %31 = mul i64 0, %18
-; CHECK-NEXT: %32 = sub i64 %18, 1
-; CHECK-NEXT: %33 = mul i64 -1, %32
-; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31
-; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %35, align 4
+; CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0
+; CHECK-NEXT: %25 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %25)
+; CHECK-NEXT: %26 = add <vscale x 4 x i32> %wide.strided.load, splat (i32 1)
+; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT: %28 = mul i64 0, %18
+; CHECK-NEXT: %29 = sub i64 %18, 1
+; CHECK-NEXT: %30 = mul i64 -1, %29
+; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28
+; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %31, i64 %30
+; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %26)
+; CHECK-NEXT: store <vscale x 4 x i32> %reverse, ptr %32, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
+; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %33, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -368,7 +364,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -378,12 +374,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %37, 1
+; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %34, 1
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -449,7 +445,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
@@ -491,9 +487,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<%3> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
@@ -508,20 +504,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
-; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
-; CHECK-NEXT: WIDEN store vp<%10>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
@@ -558,7 +554,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
@@ -589,7 +585,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 26
+; CHECK-NEXT: LV: Loop cost is 25
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -600,9 +596,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%18> = VF
-; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF
-; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: Live-in ir<%0> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
@@ -643,19 +639,19 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
+; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
@@ -744,25 +740,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
-; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 %18, 1
-; CHECK-NEXT: %26 = mul i64 -1, %25
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26
-; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %28, align 4
-; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
-; CHECK-NEXT: %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %31 = mul i64 0, %18
-; CHECK-NEXT: %32 = sub i64 %18, 1
-; CHECK-NEXT: %33 = mul i64 -1, %32
-; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31
-; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %35, align 4
+; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0
+; CHECK-NEXT: %25 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %25)
+; CHECK-NEXT: %26 = fadd <vscale x 4 x float> %wide.strided.load, splat (float 1.000000e+00)
+; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT: %28 = mul i64 0, %18
+; CHECK-NEXT: %29 = sub i64 %18, 1
+; CHECK-NEXT: %30 = mul i64 -1, %29
+; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28
+; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30
+; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %26)
+; CHECK-NEXT: store <vscale x 4 x float> %reverse, ptr %32, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
+; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %33, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -779,7 +771,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -789,12 +781,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00
+; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 91d94e52d0990..c156fc14a2300 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -29,39 +29,33 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP18]], 1
-; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 -1, [[TMP11]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1
-; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP8]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP10]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], 1
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 -1, [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
; IF-EVL: scalar.ph:
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
@@ -129,37 +123,30 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP26]], 1
-; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP15]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1
-; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> [[TMP9]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP13]], 1
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP15]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[TMP14]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP16]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP18]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
; IF-EVL: scalar.ph:
>From f81211c5cd7aa0357ffb6f8e82771991b09f7dc8 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 14 May 2025 02:02:08 -0700
Subject: [PATCH 02/16] [WIP][VPlan Based] Try to remove CM_Strided from
uniform analysis
Also cherry-pick the branch Mel-Chen:legalizeAndOptimizeInductions
However, this still does not work as well as collectLoopUniforms when the
use chain is too complicated. :(
---
.../Transforms/Vectorize/LoopVectorize.cpp | 6 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 22 +-
.../RISCV/riscv-vector-reverse.ll | 240 ++++++++++--------
3 files changed, 146 insertions(+), 122 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5bf80940617a5..f2c742cf62927 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3413,9 +3413,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (IsUniformMemOpUse(I))
return true;
- return (
- WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 75113706df420..8b359d53e3afb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -627,13 +627,14 @@ static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
static void legalizeAndOptimizeInductions(VPlan &Plan) {
using namespace llvm::VPlanPatternMatch;
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
- VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
- for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
- auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
- if (!PhiR)
- continue;
+ SmallVector<VPWidenInductionRecipe *, 4> InductionPhis;
+ for (VPRecipeBase &R : HeaderVPBB->phis())
+ if (auto *IV = dyn_cast<VPWidenInductionRecipe>(&R))
+ InductionPhis.push_back(IV);
+ bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
+ VPBuilder Builder;
+ for (VPWidenInductionRecipe *PhiR : reverse(InductionPhis)) {
// Try to narrow wide and replicating recipes to uniform recipes, based on
// VPlan analysis.
// TODO: Apply to all recipes in the future, to replace legacy uniformity
@@ -643,7 +644,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
auto *Def = dyn_cast<VPSingleDefRecipe>(U);
auto *RepR = dyn_cast<VPReplicateRecipe>(U);
// Skip recipes that shouldn't be narrowed.
- if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
+ if (!Def ||
+ !isa<VPReplicateRecipe, VPWidenRecipe, VPWidenGEPRecipe>(Def) ||
Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
(RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
continue;
@@ -656,11 +658,13 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
Def->operands(), /*IsUniform*/ true);
Clone->insertAfter(Def);
Def->replaceAllUsesWith(Clone);
+ Def->eraseFromParent();
}
+ Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
// Replace wide pointer inductions which have only their scalars used by
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
- if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
+ if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(PhiR)) {
if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
continue;
@@ -681,7 +685,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
// Replace widened induction with scalar steps for users that only use
// scalars.
- auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(PhiR);
if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
return U->usesScalars(WideIV);
}))
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 416eba01440b1..61c380ca079b9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -25,18 +25,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
@@ -73,9 +68,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
@@ -98,10 +90,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
+; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
@@ -144,8 +135,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: }
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
@@ -158,27 +149,26 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): At #0 Interval # 0
; CHECK-NEXT: LV(REG): At #1 Interval # 1
; CHECK-NEXT: LV(REG): At #2 Interval # 2
-; CHECK-NEXT: LV(REG): At #3 Interval # 2
-; CHECK-NEXT: LV(REG): At #4 Interval # 2
-; CHECK-NEXT: LV(REG): At #5 Interval # 2
-; CHECK-NEXT: LV(REG): At #6 Interval # 3
-; CHECK-NEXT: LV(REG): At #7 Interval # 3
-; CHECK-NEXT: LV(REG): At #8 Interval # 3
-; CHECK-NEXT: LV(REG): At #9 Interval # 3
-; CHECK-NEXT: LV(REG): At #10 Interval # 3
+; CHECK-NEXT: LV(REG): At #3 Interval # 3
+; CHECK-NEXT: LV(REG): At #4 Interval # 3
+; CHECK-NEXT: LV(REG): At #5 Interval # 4
+; CHECK-NEXT: LV(REG): At #6 Interval # 4
+; CHECK-NEXT: LV(REG): At #7 Interval # 4
+; CHECK-NEXT: LV(REG): At #8 Interval # 4
+; CHECK-NEXT: LV(REG): At #9 Interval # 4
+; CHECK-NEXT: LV(REG): At #10 Interval # 4
; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 2
-; CHECK-NEXT: LV(REG): At #13 Interval # 2
+; CHECK-NEXT: LV(REG): At #12 Interval # 3
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 23
+; CHECK-NEXT: LV: Loop cost is 27
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -229,13 +219,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: EMIT vp<%3> = step-vector i32
+; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n>
+; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1>
+; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5>
+; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6>
+; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32
+; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7>
+; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb<vector.ph> ], [ vp<[[IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
+; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
@@ -244,6 +242,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
@@ -320,6 +319,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %19 = sub i64 %0, %n.vec
; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32
; CHECK-NEXT: %20 = sub i32 %n, %.cast
+; CHECK-NEXT: %21 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %n, i64 0
+; CHECK-NEXT: %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: %22 = mul <vscale x 4 x i32> %21, splat (i32 -1)
+; CHECK-NEXT: %induction = add <vscale x 4 x i32> %broadcast.splat, %22
+; CHECK-NEXT: %23 = trunc i64 %18 to i32
+; CHECK-NEXT: %24 = mul i32 -1, %23
+; CHECK-NEXT: %broadcast.splatinsert3 = insertelement <vscale x 4 x i32> poison, i32 %24, i64 0
+; CHECK-NEXT: %broadcast.splat4 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert3, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br
; CHECK-NEXT: LV: draw edge from vector.memcheck
; CHECK-NEXT: LV: created vector.body
@@ -328,26 +336,28 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ]
-; CHECK-NEXT: %.cast3 = trunc i64 %index to i32
-; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3
-; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
-; CHECK-NEXT: %22 = zext i32 %21 to i64
-; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
-; CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0
-; CHECK-NEXT: %25 = trunc i64 %18 to i32
-; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %25)
-; CHECK-NEXT: %26 = add <vscale x 4 x i32> %wide.strided.load, splat (i32 1)
-; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT: %28 = mul i64 0, %18
-; CHECK-NEXT: %29 = sub i64 %18, 1
-; CHECK-NEXT: %30 = mul i64 -1, %29
-; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28
-; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %31, i64 %30
-; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %26)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse, ptr %32, align 4
+; CHECK-NEXT: %vec.ind = phi <vscale x 4 x i32>
+; CHECK-NEXT: %25 = add nsw <vscale x 4 x i32> %vec.ind, splat (i32 -1)
+; CHECK-NEXT: %26 = zext <vscale x 4 x i32> %25 to <vscale x 4 x i64>
+; CHECK-NEXT: %27 = extractelement <vscale x 4 x i64> %26, i32 0
+; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %B, i64 %27
+; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %28, i32 0
+; CHECK-NEXT: %30 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %29, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %30)
+; CHECK-NEXT: %31 = add <vscale x 4 x i32> %wide.strided.load, splat (i32 1)
+; CHECK-NEXT: %32 = extractelement <vscale x 4 x i64> %26, i32 0
+; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %A, i64 %32
+; CHECK-NEXT: %34 = mul i64 0, %18
+; CHECK-NEXT: %35 = sub i64 %18, 1
+; CHECK-NEXT: %36 = mul i64 -1, %35
+; CHECK-NEXT: %37 = getelementptr inbounds i32, ptr %33, i64 %34
+; CHECK-NEXT: %38 = getelementptr inbounds i32, ptr %37, i64 %36
+; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %31)
+; CHECK-NEXT: store <vscale x 4 x i32> %reverse, ptr %38, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %33, <null operand!>, label %vector.body
+; CHECK-NEXT: %vec.ind.next = add <vscale x 4 x i32> %vec.ind, %broadcast.splat4
+; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %39, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -364,7 +374,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -374,12 +384,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %34, 1
+; CHECK-NEXT: %40 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %40, 1
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -432,18 +442,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
@@ -480,9 +485,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
@@ -505,10 +507,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
+; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
@@ -551,8 +552,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: }
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
@@ -565,27 +566,26 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): At #0 Interval # 0
; CHECK-NEXT: LV(REG): At #1 Interval # 1
; CHECK-NEXT: LV(REG): At #2 Interval # 2
-; CHECK-NEXT: LV(REG): At #3 Interval # 2
-; CHECK-NEXT: LV(REG): At #4 Interval # 2
-; CHECK-NEXT: LV(REG): At #5 Interval # 2
-; CHECK-NEXT: LV(REG): At #6 Interval # 3
-; CHECK-NEXT: LV(REG): At #7 Interval # 3
-; CHECK-NEXT: LV(REG): At #8 Interval # 3
-; CHECK-NEXT: LV(REG): At #9 Interval # 3
-; CHECK-NEXT: LV(REG): At #10 Interval # 3
+; CHECK-NEXT: LV(REG): At #3 Interval # 3
+; CHECK-NEXT: LV(REG): At #4 Interval # 3
+; CHECK-NEXT: LV(REG): At #5 Interval # 4
+; CHECK-NEXT: LV(REG): At #6 Interval # 4
+; CHECK-NEXT: LV(REG): At #7 Interval # 4
+; CHECK-NEXT: LV(REG): At #8 Interval # 4
+; CHECK-NEXT: LV(REG): At #9 Interval # 4
+; CHECK-NEXT: LV(REG): At #10 Interval # 4
; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 2
-; CHECK-NEXT: LV(REG): At #13 Interval # 2
+; CHECK-NEXT: LV(REG): At #12 Interval # 3
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 25
+; CHECK-NEXT: LV: Loop cost is 29
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -636,13 +636,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT: EMIT vp<%3> = step-vector i32
+; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n>
+; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1>
+; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5>
+; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6>
+; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32
+; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7>
+; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb<vector.ph> ], [ vp<[[IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
+; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
@@ -651,6 +659,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
@@ -727,6 +736,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %19 = sub i64 %0, %n.vec
; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32
; CHECK-NEXT: %20 = sub i32 %n, %.cast
+; CHECK-NEXT: %21 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %n, i64 0
+; CHECK-NEXT: %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: %22 = mul <vscale x 4 x i32> %21, splat (i32 -1)
+; CHECK-NEXT: %induction = add <vscale x 4 x i32> %broadcast.splat, %22
+; CHECK-NEXT: %23 = trunc i64 %18 to i32
+; CHECK-NEXT: %24 = mul i32 -1, %23
+; CHECK-NEXT: %broadcast.splatinsert3 = insertelement <vscale x 4 x i32> poison, i32 %24, i64 0
+; CHECK-NEXT: %broadcast.splat4 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert3, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br
; CHECK-NEXT: LV: draw edge from vector.memcheck
; CHECK-NEXT: LV: created vector.body
@@ -735,26 +753,28 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ]
-; CHECK-NEXT: %.cast3 = trunc i64 %index to i32
-; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3
-; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
-; CHECK-NEXT: %22 = zext i32 %21 to i64
-; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
-; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0
-; CHECK-NEXT: %25 = trunc i64 %18 to i32
-; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %25)
-; CHECK-NEXT: %26 = fadd <vscale x 4 x float> %wide.strided.load, splat (float 1.000000e+00)
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %28 = mul i64 0, %18
-; CHECK-NEXT: %29 = sub i64 %18, 1
-; CHECK-NEXT: %30 = mul i64 -1, %29
-; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28
-; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30
-; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %26)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse, ptr %32, align 4
+; CHECK-NEXT: %vec.ind = phi <vscale x 4 x i32>
+; CHECK-NEXT: %25 = add nsw <vscale x 4 x i32> %vec.ind, splat (i32 -1)
+; CHECK-NEXT: %26 = zext <vscale x 4 x i32> %25 to <vscale x 4 x i64>
+; CHECK-NEXT: %27 = extractelement <vscale x 4 x i64> %26, i32 0
+; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %B, i64 %27
+; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %28, i32 0
+; CHECK-NEXT: %30 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %29, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %30)
+; CHECK-NEXT: %31 = fadd <vscale x 4 x float> %wide.strided.load, splat (float 1.000000e+00)
+; CHECK-NEXT: %32 = extractelement <vscale x 4 x i64> %26, i32 0
+; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %A, i64 %32
+; CHECK-NEXT: %34 = mul i64 0, %18
+; CHECK-NEXT: %35 = sub i64 %18, 1
+; CHECK-NEXT: %36 = mul i64 -1, %35
+; CHECK-NEXT: %37 = getelementptr inbounds float, ptr %33, i64 %34
+; CHECK-NEXT: %38 = getelementptr inbounds float, ptr %37, i64 %36
+; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %31)
+; CHECK-NEXT: store <vscale x 4 x float> %reverse, ptr %38, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %33, <null operand!>, label %vector.body
+; CHECK-NEXT: %vec.ind.next = add <vscale x 4 x i32> %vec.ind, %broadcast.splat4
+; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %39, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -771,7 +791,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -781,12 +801,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00
+; CHECK-NEXT: %40 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %40, 1.000000e+00
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
>From 806d586681c7ebe9f33ca89b8e28124bd47c9893 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 20 May 2025 00:42:15 -0700
Subject: [PATCH 03/16] [WIP][VPlan Based] Generate VPWidenStridedLoadRecipe in
VPlanTransform
Still relies on CM_Strided to determine legality and cost.
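A minimal, self-contained sketch (plain C++, not LLVM code and not part of this
patch) of the property the conversion relies on: a strided load whose byte
stride is minus the element size, starting at the address the scalar reverse
loop touches first, yields the same lanes as a contiguous load followed by a
vector reverse, so the reverse shuffle on the load side can be dropped. The
names stridedLoad and EVL below are illustrative only.

#include <cassert>
#include <cstdint>
#include <vector>

// Emulate a strided vector load: lane I reads from Ptr + I * ByteStride bytes.
static std::vector<float> stridedLoad(const float *Ptr, int64_t ByteStride,
                                      unsigned EVL) {
  std::vector<float> Lanes;
  const char *Base = reinterpret_cast<const char *>(Ptr);
  for (unsigned I = 0; I < EVL; ++I)
    Lanes.push_back(*reinterpret_cast<const float *>(
        Base + static_cast<int64_t>(I) * ByteStride));
  return Lanes;
}

int main() {
  std::vector<float> B = {0, 1, 2, 3, 4, 5, 6, 7};
  unsigned VF = 4;
  // A scalar reverse loop reads B[7], B[6], B[5], B[4] in that order.
  int64_t ByteStride = -static_cast<int64_t>(sizeof(float)); // element stride -1
  std::vector<float> Lanes = stridedLoad(&B[7], ByteStride, VF);
  for (unsigned I = 0; I < VF; ++I)
    assert(Lanes[I] == B[7 - I]); // same lanes, no vector.reverse needed
  return 0;
}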
---
.../Transforms/Vectorize/LoopVectorize.cpp | 52 +++++--------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 49 +++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 ++
3 files changed, 66 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f2c742cf62927..e635256e96951 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1320,14 +1320,9 @@ class LoopVectorizationCostModel {
/// that can be vectorized.
bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
- /// Get the stride of the strided memory access instruction \p Instr. Return 0
- /// if the instruction \p Instr is not considered for vectorization as a
- /// strided memory access.
- int64_t getStride(Instruction *Instr) const {
- auto It = StrideInfo.find(Instr);
- if (It != StrideInfo.end())
- return It->second;
- return 0;
+ /// Get the stride information of the strided memory accesses.
+ SmallDenseMap<Instruction *, int64_t> getStrideInfo() const {
+ return StrideInfo;
}
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1721,7 +1716,7 @@ class LoopVectorizationCostModel {
}
/// The mapping of memory access instructions to their stride values.
- DenseMap<Instruction *, int64_t> StrideInfo;
+ SmallDenseMap<Instruction *, int64_t> StrideInfo;
public:
/// The loop that we evaluate.
@@ -7826,27 +7821,16 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
// reverse consecutive.
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
-
- auto SameWiden = [&](ElementCount VF) -> bool {
- return Decision == CM.getWideningDecision(I, VF);
- };
- bool ContainsWidenVF =
- LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
- assert(ContainsWidenVF &&
- "At least widen the memory accesses by the Start VF.");
-
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
- bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
- if (Consecutive || Strided) {
+ if (Consecutive) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
if (Reverse) {
- assert(!Strided && "Reverse and Strided are mutually exclusive.");
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
@@ -7858,30 +7842,17 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, I->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
- GEP ? GEP->getNoWrapFlags()
- : GEPNoWrapFlags::none(),
- I->getDebugLoc());
+ VectorPtr = new VPVectorPointerRecipe(
+ Ptr, getLoadStoreType(I), /*Strided*/ false,
+ GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+ I->getDebugLoc());
}
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
- if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
- if (Strided) {
- const DataLayout &DL = Load->getDataLayout();
- auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
- int64_t Stride = CM.getStride(Load);
- assert(Stride == -1 &&
- "Only stride memory access with a stride of -1 is supported.");
- VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
- StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
- return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
- Mask, VPIRMetadata(*Load, LVer),
- I->getDebugLoc());
- }
+ if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());
- }
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
@@ -9032,6 +9003,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
InterleaveGroups, RecipeBuilder,
CM.isScalarEpilogueAllowed());
+ // Convert reverse (stride -1) memory accesses into strided accesses when profitable.
+ VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+ CM.getStrideInfo());
// Replace VPValues for known constant strides guaranteed by predicate scalar
// evolution.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8b359d53e3afb..d83ec28e64f78 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2691,6 +2691,55 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
R->dissolveToCFGLoop();
}
+void VPlanTransforms::convertToStridedAccesses(
+ VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo) {
+ // !!! FIXME: Should remove StrideInfo in the next step.
+ if (Plan.hasScalarVFOnly() || StrideInfo.empty())
+ return;
+
+ // !!! FIXME: Should clamp VF based on legality and cost in the next step.
+ SmallVector<VPRecipeBase *> ToErase;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ // !!! FIXME: Should use LoadR->isReverse() in the next step.
+ if (auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
+ LoadR && !LoadR->isConsecutive()) {
+ auto *LI = cast<LoadInst>(&LoadR->getIngredient());
+ auto It = StrideInfo.find(LI);
+ if (It == StrideInfo.end())
+ continue;
+ int64_t Stride = It->second;
+ assert(Stride == -1 &&
+ "Only stride memory access with a stride of -1 is supported.");
+ // !!! FIXME: Should get the VPVectorEndPointerRecipe for reverse accesses.
+ VPValue *Ptr = LoadR->getAddr();
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ Ptr->getUnderlyingValue()->stripPointerCasts());
+ auto *NewPtr = new VPVectorPointerRecipe(
+ Ptr, getLoadStoreType(LI), /*Stride*/ true,
+ GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+ LoadR->getDebugLoc());
+ NewPtr->insertBefore(LoadR);
+
+ const DataLayout &DL = LI->getDataLayout();
+ auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
+ VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+ StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI))));
+ auto *StridedLoad = new VPWidenStridedLoadRecipe(
+ *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
+ LoadR->getDebugLoc());
+ StridedLoad->insertBefore(LoadR);
+ LoadR->replaceAllUsesWith(StridedLoad);
+ ToErase.push_back(LoadR);
+ }
+ }
+ }
+
+ for (VPRecipeBase *R : ToErase)
+ R->eraseFromParent();
+}
+
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
using namespace llvm::VPlanPatternMatch;
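The new recipe takes the stride in bytes rather than in elements; the
`Stride * DL.getTypeAllocSize(...)` computation above is what turns the element
stride of -1 into the `stride = ir<-4>` seen in the i32/float tests. A tiny
standalone sketch of that formula (plain C++, illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

// Byte stride = element stride * allocation size of the element type.
constexpr int64_t byteStride(int64_t ElementStride, uint64_t TypeAllocSize) {
  return ElementStride * static_cast<int64_t>(TypeAllocSize);
}

int main() {
  static_assert(byteStride(-1, 4) == -4, "i32/float: matches stride = ir<-4>");
  static_assert(byteStride(-1, 8) == -8, "i64/double elements would use -8");
  assert(byteStride(-1, sizeof(float)) == -4);
  return 0;
}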
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..b863eb18a95da 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -175,6 +175,10 @@ struct VPlanTransforms {
&InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
+ /// Convert reverse (stride -1) memory accesses into strided accesses when profitable.
+ static void convertToStridedAccesses(
+ VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo);
+
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
>From 831c7826cd54cb37ce48b0be9e9c98678093f01a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 21 May 2025 00:10:24 -0700
Subject: [PATCH 04/16] [WIP][VPlan based] Clamp VF range in VPlan
transformation
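What "clamp VF range" means here, sketched with a standalone stand-in for
LoopVectorizationPlanner::getDecisionAndClampRange (plain C++, not the LLVM
implementation; the real VFRange works on ElementCount values): the decision
taken for Range.Start is kept, and Range.End is clamped at the first VF that
would decide differently, so a single VPlan covers a VF range with one
consistent choice.

#include <cassert>
#include <functional>

struct VFRange { unsigned Start; unsigned End; }; // VFs in [Start, End), powers of two

static bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                                     VFRange &Range) {
  bool PredicateAtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != PredicateAtStart) {
      Range.End = VF; // clamp: later VFs go into a separate plan
      break;
    }
  return PredicateAtStart;
}

int main() {
  // Pretend a strided load is only profitable for VF >= 4.
  auto IsProfitable = [](unsigned VF) { return VF >= 4; };
  VFRange Range{4, 32};
  assert(getDecisionAndClampRange(IsProfitable, Range) && Range.End == 32);
  VFRange Mixed{2, 32};
  assert(!getDecisionAndClampRange(IsProfitable, Mixed) && Mixed.End == 4);
  return 0;
}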
---
.../Transforms/Vectorize/LoopVectorize.cpp | 79 +-----
llvm/lib/Transforms/Vectorize/VPlan.h | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 25 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 92 +++---
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
.../RISCV/riscv-vector-reverse.ll | 268 ++++++++----------
6 files changed, 201 insertions(+), 273 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e635256e96951..5d51ead847e2d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1316,15 +1316,6 @@ class LoopVectorizationCostModel {
return InterleaveInfo.getInterleaveGroup(Instr);
}
- /// Returns true if \p I is a memory instruction with strided memory access
- /// that can be vectorized.
- bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
-
- /// Get the stride information of the strided memory accesses.
- SmallDenseMap<Instruction *, int64_t> getStrideInfo() const {
- return StrideInfo;
- }
-
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1572,10 +1563,6 @@ class LoopVectorizationCostModel {
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
- /// The cost computation for strided load/store instruction.
- InstructionCost getStridedLoadStoreCost(Instruction *I,
- ElementCount VF) const;
-
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1715,9 +1702,6 @@ class LoopVectorizationCostModel {
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
- /// The mapping of memory access instructions to their stride values.
- SmallDenseMap<Instruction *, int64_t> StrideInfo;
-
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -3293,31 +3277,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
return true;
}
-bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
- Instruction *I, ElementCount VF) const {
- // Get and ensure we have a valid memory instruction.
- assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
-
- // Only support strided access for vector VF.
- if (!VF.isVector())
- return false;
-
- // FIXME: Remove this check for StoreInst after strided store is supported.
- if (isa<StoreInst>(I))
- return false;
-
- [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
- auto *ScalarTy = getLoadStoreType(I);
- // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
- // to ensure that the accessed addresses are evenly spaced apart by a fixed
- // stride.
- assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
- "Only supports strided accesses with a stride of -1");
-
- const Align Alignment = getLoadStoreAlignment(I);
- return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
-}
-
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
@@ -5473,19 +5432,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
-InstructionCost
-LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
- ElementCount VF) const {
- Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- const Value *Ptr = getLoadStorePointerOperand(I);
-
- return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I), Alignment,
- CostKind, I);
-}
-
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
@@ -5805,17 +5751,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
"Expected consecutive stride.");
InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
- // Consider using strided load/store for consecutive reverse accesses to
- // achieve more efficient memory operations.
- if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
- const InstructionCost StridedLoadStoreCost =
- getStridedLoadStoreCost(&I, VF);
- if (StridedLoadStoreCost < Cost) {
- Decision = CM_Strided;
- Cost = StridedLoadStoreCost;
- StrideInfo[&I] = ConsecutiveStride;
- }
- }
setWideningDecision(&I, VF, Decision, Cost);
continue;
}
@@ -8986,12 +8921,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
- if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
- CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
+ if (!CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
- }
+
+ // Convert reverse (stride -1) memory accesses into strided accesses when profitable.
+ VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+ CostCtx, Range);
for (ElementCount VF : Range)
Plan->addVF(VF);
@@ -9003,9 +8941,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
InterleaveGroups, RecipeBuilder,
CM.isScalarEpilogueAllowed());
- // Convert reverse (stride -1) memory accesses into strided accesses when profitable.
- VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
- CM.getStrideInfo());
// Replace VPValues for known constant strides guaranteed by predicate scalar
// evolution.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 569869e8e4bd4..c9e51d9abaf90 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1725,6 +1725,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
+ VPValue *getPtr() const { return getOperand(0); }
+
VPValue *getVFValue() { return getOperand(1); }
const VPValue *getVFValue() const { return getOperand(1); }
@@ -3089,10 +3091,6 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
/// Generate a strided load.
void execute(VPTransformState &State) override;
- /// Return the cost of this VPWidenStridedLoadRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f05c5b178a3e5..8886bc8765b2e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3073,9 +3073,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
- unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
- ? Instruction::Load
- : Instruction::Store;
+ unsigned Opcode =
+ isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ this)
+ ? Instruction::Load
+ : Instruction::Store;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -3084,6 +3086,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
+
+ if (isa<VPWidenStridedLoadRecipe>(this))
+ return Ctx.TTI.getStridedMemoryOpCost(
+ Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+
return Ctx.TTI.getAddressComputationCost(Ty) +
Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
Ctx.CostKind, &Ingredient);
@@ -3276,18 +3283,6 @@ void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-InstructionCost
-VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
- const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-
- return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
-}
-
void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d83ec28e64f78..7a67df4cd7b6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2691,48 +2691,68 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
R->dissolveToCFGLoop();
}
-void VPlanTransforms::convertToStridedAccesses(
- VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo) {
- // !!! FIXME: Should remove StrideInfo in the next step.
- if (Plan.hasScalarVFOnly() || StrideInfo.empty())
+void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range) {
+ if (Plan.hasScalarVFOnly())
return;
- // !!! FIXME: Should clamp VF based on legality and cost in the next step.
SmallVector<VPRecipeBase *> ToErase;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- // !!! FIXME: Should use LoadR->isReverse() in the next step.
- if (auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
- LoadR && !LoadR->isConsecutive()) {
- auto *LI = cast<LoadInst>(&LoadR->getIngredient());
- auto It = StrideInfo.find(LI);
- if (It == StrideInfo.end())
- continue;
- int64_t Stride = It->second;
- assert(Stride == -1 &&
- "Only stride memory access with a stride of -1 is supported.");
- // !!! FIXME: Should get the VPVectorEndPointerRecipe for reverse accesses.
- VPValue *Ptr = LoadR->getAddr();
- auto *GEP = dyn_cast<GetElementPtrInst>(
- Ptr->getUnderlyingValue()->stripPointerCasts());
- auto *NewPtr = new VPVectorPointerRecipe(
- Ptr, getLoadStoreType(LI), /*Stride*/ true,
- GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
- LoadR->getDebugLoc());
- NewPtr->insertBefore(LoadR);
-
- const DataLayout &DL = LI->getDataLayout();
- auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
- VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
- StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI))));
- auto *StridedLoad = new VPWidenStridedLoadRecipe(
- *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
- LoadR->getDebugLoc());
- StridedLoad->insertBefore(LoadR);
- LoadR->replaceAllUsesWith(StridedLoad);
- ToErase.push_back(LoadR);
- }
+ auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+ // TODO: support strided store
+ // TODO: support strided accesses with stride not equal to -1
+ if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
+ continue;
+
+ Instruction &Ingredient = MemR->getIngredient();
+ Type *ElementTy = getLoadStoreType(&Ingredient);
+
+ auto IsProfitable = [&](ElementCount VF) -> bool {
+ Type *DataTy = toVectorTy(ElementTy, VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+ return false;
+ const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+ const InstructionCost StridedLoadStoreCost =
+ Ctx.TTI.getStridedMemoryOpCost(
+ Ingredient.getOpcode(), DataTy,
+ getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
+ Alignment, Ctx.CostKind, &Ingredient);
+ return StridedLoadStoreCost < CurrentCost;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
+ Range))
+ continue;
+
+ // The stride of a consecutive reverse access must be -1.
+ int64_t Stride = -1;
+ auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
+ VPValue *Ptr = VecEndPtr->getPtr();
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ Ptr->getUnderlyingValue()->stripPointerCasts());
+ // Create a new vector pointer for strided access.
+ auto *NewPtr = new VPVectorPointerRecipe(
+ Ptr, ElementTy, /*Stride=*/ true,
+ GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+ VecEndPtr->getDebugLoc());
+ NewPtr->insertBefore(MemR);
+
+ auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
+ auto *LI = cast<LoadInst>(&Ingredient);
+ const DataLayout &DL = LI->getDataLayout();
+ auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
+ VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+ StrideTy, Stride * DL.getTypeAllocSize(ElementTy)));
+ auto *StridedLoad = new VPWidenStridedLoadRecipe(
+ *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
+ LoadR->getDebugLoc());
+ StridedLoad->insertBefore(LoadR);
+ LoadR->replaceAllUsesWith(StridedLoad);
+
+ ToErase.append({LoadR, VecEndPtr});
}
}
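The IsProfitable lambda above gates the conversion per VF: the strided form is
chosen only when it is legal for the target and strictly cheaper than the cost
of the current recipe, so a tie keeps the existing reverse widening. A hedged
standalone sketch of that decision (plain C++; the cost numbers are made up for
illustration and are not real TTI values):

#include <cassert>

struct Costs { bool StridedLegal; unsigned CurrentCost; unsigned StridedCost; };

static bool preferStrided(const Costs &C) {
  if (!C.StridedLegal)
    return false;
  return C.StridedCost < C.CurrentCost; // strict: a tie keeps the reverse form
}

int main() {
  assert(preferStrided({true, /*reverse*/ 11, /*strided*/ 8}));  // cheaper -> convert
  assert(!preferStrided({true, 8, 8}));                          // tie -> keep reverse
  assert(!preferStrided({false, 11, 8}));                        // illegal -> keep reverse
  return 0;
}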
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b863eb18a95da..e4feb14275d2a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -176,8 +176,8 @@ struct VPlanTransforms {
VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
/// Convert reverse (stride -1) memory accesses into strided accesses when profitable.
- static void convertToStridedAccesses(
- VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo);
+ static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range);
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 61c380ca079b9..97afa9f87ac24 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -25,15 +25,20 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
@@ -68,6 +73,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: LV: Loop does not require scalar epilogue
+; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
@@ -90,9 +98,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
-; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
@@ -135,10 +144,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: }
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
@@ -149,26 +158,27 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): At #0 Interval # 0
; CHECK-NEXT: LV(REG): At #1 Interval # 1
; CHECK-NEXT: LV(REG): At #2 Interval # 2
-; CHECK-NEXT: LV(REG): At #3 Interval # 3
-; CHECK-NEXT: LV(REG): At #4 Interval # 3
-; CHECK-NEXT: LV(REG): At #5 Interval # 4
-; CHECK-NEXT: LV(REG): At #6 Interval # 4
-; CHECK-NEXT: LV(REG): At #7 Interval # 4
-; CHECK-NEXT: LV(REG): At #8 Interval # 4
-; CHECK-NEXT: LV(REG): At #9 Interval # 4
-; CHECK-NEXT: LV(REG): At #10 Interval # 4
+; CHECK-NEXT: LV(REG): At #3 Interval # 2
+; CHECK-NEXT: LV(REG): At #4 Interval # 2
+; CHECK-NEXT: LV(REG): At #5 Interval # 2
+; CHECK-NEXT: LV(REG): At #6 Interval # 3
+; CHECK-NEXT: LV(REG): At #7 Interval # 3
+; CHECK-NEXT: LV(REG): At #8 Interval # 3
+; CHECK-NEXT: LV(REG): At #9 Interval # 3
+; CHECK-NEXT: LV(REG): At #10 Interval # 3
; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 3
+; CHECK-NEXT: LV(REG): At #12 Interval # 2
+; CHECK-NEXT: LV(REG): At #13 Interval # 2
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 27
+; CHECK-NEXT: LV: Loop cost is 24
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -219,31 +229,22 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: EMIT vp<%3> = step-vector i32
-; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n>
-; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1>
-; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5>
-; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6>
-; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32
-; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7>
-; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb<vector.ph> ], [ vp<[[IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
-; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
-; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
-; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
-; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]>
-; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
@@ -319,15 +320,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %19 = sub i64 %0, %n.vec
; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32
; CHECK-NEXT: %20 = sub i32 %n, %.cast
-; CHECK-NEXT: %21 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %n, i64 0
-; CHECK-NEXT: %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: %22 = mul <vscale x 4 x i32> %21, splat (i32 -1)
-; CHECK-NEXT: %induction = add <vscale x 4 x i32> %broadcast.splat, %22
-; CHECK-NEXT: %23 = trunc i64 %18 to i32
-; CHECK-NEXT: %24 = mul i32 -1, %23
-; CHECK-NEXT: %broadcast.splatinsert3 = insertelement <vscale x 4 x i32> poison, i32 %24, i64 0
-; CHECK-NEXT: %broadcast.splat4 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert3, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br
; CHECK-NEXT: LV: draw edge from vector.memcheck
; CHECK-NEXT: LV: created vector.body
@@ -336,28 +328,26 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ]
-; CHECK-NEXT: %vec.ind = phi <vscale x 4 x i32>
-; CHECK-NEXT: %25 = add nsw <vscale x 4 x i32> %vec.ind, splat (i32 -1)
-; CHECK-NEXT: %26 = zext <vscale x 4 x i32> %25 to <vscale x 4 x i64>
-; CHECK-NEXT: %27 = extractelement <vscale x 4 x i64> %26, i32 0
-; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %B, i64 %27
-; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %28, i32 0
-; CHECK-NEXT: %30 = trunc i64 %18 to i32
-; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %29, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %30)
-; CHECK-NEXT: %31 = add <vscale x 4 x i32> %wide.strided.load, splat (i32 1)
-; CHECK-NEXT: %32 = extractelement <vscale x 4 x i64> %26, i32 0
-; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %A, i64 %32
-; CHECK-NEXT: %34 = mul i64 0, %18
-; CHECK-NEXT: %35 = sub i64 %18, 1
-; CHECK-NEXT: %36 = mul i64 -1, %35
-; CHECK-NEXT: %37 = getelementptr inbounds i32, ptr %33, i64 %34
-; CHECK-NEXT: %38 = getelementptr inbounds i32, ptr %37, i64 %36
-; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %31)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse, ptr %38, align 4
+; CHECK-NEXT: %.cast3 = trunc i64 %index to i32
+; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3
+; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
+; CHECK-NEXT: %22 = zext i32 %21 to i64
+; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
+; CHECK-NEXT: %24 = getelementptr inbounds i32, ptr %23, i32 0
+; CHECK-NEXT: %25 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %24, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %25)
+; CHECK-NEXT: %26 = add <vscale x 4 x i32> %wide.strided.load, splat (i32 1)
+; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT: %28 = mul i64 0, %18
+; CHECK-NEXT: %29 = sub i64 %18, 1
+; CHECK-NEXT: %30 = mul i64 -1, %29
+; CHECK-NEXT: %31 = getelementptr inbounds i32, ptr %27, i64 %28
+; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %31, i64 %30
+; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %26)
+; CHECK-NEXT: store <vscale x 4 x i32> %reverse, ptr %32, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %vec.ind.next = add <vscale x 4 x i32> %vec.ind, %broadcast.splat4
-; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %39, <null operand!>, label %vector.body
+; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %33, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -374,7 +364,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -384,12 +374,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %40 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %40, 1
+; CHECK-NEXT: %34 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %34, 1
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -442,15 +432,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
@@ -485,6 +480,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: LV: Loop does not require scalar epilogue
+; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
@@ -507,9 +505,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
-; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
@@ -552,10 +551,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: }
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
@@ -566,26 +565,27 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): At #0 Interval # 0
; CHECK-NEXT: LV(REG): At #1 Interval # 1
; CHECK-NEXT: LV(REG): At #2 Interval # 2
-; CHECK-NEXT: LV(REG): At #3 Interval # 3
-; CHECK-NEXT: LV(REG): At #4 Interval # 3
-; CHECK-NEXT: LV(REG): At #5 Interval # 4
-; CHECK-NEXT: LV(REG): At #6 Interval # 4
-; CHECK-NEXT: LV(REG): At #7 Interval # 4
-; CHECK-NEXT: LV(REG): At #8 Interval # 4
-; CHECK-NEXT: LV(REG): At #9 Interval # 4
-; CHECK-NEXT: LV(REG): At #10 Interval # 4
+; CHECK-NEXT: LV(REG): At #3 Interval # 2
+; CHECK-NEXT: LV(REG): At #4 Interval # 2
+; CHECK-NEXT: LV(REG): At #5 Interval # 2
+; CHECK-NEXT: LV(REG): At #6 Interval # 3
+; CHECK-NEXT: LV(REG): At #7 Interval # 3
+; CHECK-NEXT: LV(REG): At #8 Interval # 3
+; CHECK-NEXT: LV(REG): At #9 Interval # 3
+; CHECK-NEXT: LV(REG): At #10 Interval # 3
; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 3
+; CHECK-NEXT: LV(REG): At #12 Interval # 2
+; CHECK-NEXT: LV(REG): At #13 Interval # 2
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 29
+; CHECK-NEXT: LV: Loop cost is 26
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -636,21 +636,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: EMIT vp<%3> = step-vector i32
-; CHECK-NEXT: EMIT vp<%4> = broadcast ir<%n>
-; CHECK-NEXT: EMIT vp<%5> = broadcast ir<-1>
-; CHECK-NEXT: EMIT vp<%6> = mul vp<%3>, vp<%5>
-; CHECK-NEXT: EMIT vp<[[IV_START:%.+]]> = add vp<%4>, vp<%6>
-; CHECK-NEXT: EMIT-SCALAR vp<%7> = trunc ir<%18> to i32
-; CHECK-NEXT: EMIT vp<%8> = mul ir<-1>, vp<%7>
-; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = broadcast vp<%8>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: WIDEN-PHI ir<[[WIDEN_IV:%.+]]> = phi [ vp<[[IV_START]]>, ir-bb<vector.ph> ], [ vp<[[IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
-; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
@@ -659,7 +651,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
-; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add ir<[[WIDEN_IV]]>, vp<[[IV_INC]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
@@ -736,15 +727,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %19 = sub i64 %0, %n.vec
; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32
; CHECK-NEXT: %20 = sub i32 %n, %.cast
-; CHECK-NEXT: %21 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %n, i64 0
-; CHECK-NEXT: %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: %22 = mul <vscale x 4 x i32> %21, splat (i32 -1)
-; CHECK-NEXT: %induction = add <vscale x 4 x i32> %broadcast.splat, %22
-; CHECK-NEXT: %23 = trunc i64 %18 to i32
-; CHECK-NEXT: %24 = mul i32 -1, %23
-; CHECK-NEXT: %broadcast.splatinsert3 = insertelement <vscale x 4 x i32> poison, i32 %24, i64 0
-; CHECK-NEXT: %broadcast.splat4 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert3, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br
; CHECK-NEXT: LV: draw edge from vector.memcheck
; CHECK-NEXT: LV: created vector.body
@@ -753,28 +735,26 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ]
-; CHECK-NEXT: %vec.ind = phi <vscale x 4 x i32>
-; CHECK-NEXT: %25 = add nsw <vscale x 4 x i32> %vec.ind, splat (i32 -1)
-; CHECK-NEXT: %26 = zext <vscale x 4 x i32> %25 to <vscale x 4 x i64>
-; CHECK-NEXT: %27 = extractelement <vscale x 4 x i64> %26, i32 0
-; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %B, i64 %27
-; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %28, i32 0
-; CHECK-NEXT: %30 = trunc i64 %18 to i32
-; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %29, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %30)
-; CHECK-NEXT: %31 = fadd <vscale x 4 x float> %wide.strided.load, splat (float 1.000000e+00)
-; CHECK-NEXT: %32 = extractelement <vscale x 4 x i64> %26, i32 0
-; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %A, i64 %32
-; CHECK-NEXT: %34 = mul i64 0, %18
-; CHECK-NEXT: %35 = sub i64 %18, 1
-; CHECK-NEXT: %36 = mul i64 -1, %35
-; CHECK-NEXT: %37 = getelementptr inbounds float, ptr %33, i64 %34
-; CHECK-NEXT: %38 = getelementptr inbounds float, ptr %37, i64 %36
-; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %31)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse, ptr %38, align 4
+; CHECK-NEXT: %.cast3 = trunc i64 %index to i32
+; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3
+; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
+; CHECK-NEXT: %22 = zext i32 %21 to i64
+; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
+; CHECK-NEXT: %24 = getelementptr inbounds float, ptr %23, i32 0
+; CHECK-NEXT: %25 = trunc i64 %18 to i32
+; CHECK-NEXT: %wide.strided.load = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 %24, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %25)
+; CHECK-NEXT: %26 = fadd <vscale x 4 x float> %wide.strided.load, splat (float 1.000000e+00)
+; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT: %28 = mul i64 0, %18
+; CHECK-NEXT: %29 = sub i64 %18, 1
+; CHECK-NEXT: %30 = mul i64 -1, %29
+; CHECK-NEXT: %31 = getelementptr inbounds float, ptr %27, i64 %28
+; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %31, i64 %30
+; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %26)
+; CHECK-NEXT: store <vscale x 4 x float> %reverse, ptr %32, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %vec.ind.next = add <vscale x 4 x i32> %vec.ind, %broadcast.splat4
-; CHECK-NEXT: %39 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %39, <null operand!>, label %vector.body
+; CHECK-NEXT: %33 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %33, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -791,7 +771,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val4 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
; CHECK-NEXT: br label %for.body
; CHECK-NEXT: LV: draw edge from middle.block
; CHECK-NEXT: LV: draw edge from for.body.preheader
@@ -801,12 +781,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: filled BB:
; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val4, %scalar.ph ], [ %i.0, %for.body ]
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %40 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %40, 1.000000e+00
+; CHECK-NEXT: %34 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %34, 1.000000e+00
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
>From 32ebb119043e573f083d7dd4d82d6e60c2ad98de Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 21 May 2025 00:15:28 -0700
Subject: [PATCH 05/16] [WIP][VPlan based] Time to remove CM_Strided
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5d51ead847e2d..8a3a84431c6bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1085,7 +1085,6 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
- CM_Strided,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -6397,8 +6396,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI::CastContextHint::Normal;
switch (getWideningDecision(I, VF)) {
- // TODO: New CastContextHint for strided accesses.
- case LoopVectorizationCostModel::CM_Strided:
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
case LoopVectorizationCostModel::CM_Interleave:
>From 96da92eb865e79b701e695341a3df2fef607ff19 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 21 May 2025 00:58:56 -0700
Subject: [PATCH 06/16] [VPlan based] Patch comments, nfc
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++++---
llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 4 +++-
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8a3a84431c6bf..d2d44e170fde3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8914,17 +8914,18 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
- CM.CostKind);
if (!CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
- // !!! NEED COMMENT
+ // Convert reverse memory recipes to strided access recipes if the strided
+ // access is legal and profitable.
VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
CostCtx, Range);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e4feb14275d2a..1f0404b63248d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -175,7 +175,9 @@ struct VPlanTransforms {
&InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
- // !!! NEED COMMENT
+ /// Transform reverse memory recipes into strided access recipes when legal
+ /// and profitable. Clamps \p Range to maintain consistency with widen
+ /// decisions of \p Plan, and uses \p Ctx to evaluate the cost.
static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range);
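A note on the profitability half of that comment: it comes down to one TTI query per VF, comparing the cost of the existing reverse widened load against the cost of the equivalent strided load. The sketch below is condensed from the cost code later in this series; the lambda wrapper is paraphrased, and DataTy, Alignment, PtrUV and Ctx are as in the patch.

  // Rewrite only when the strided load is strictly cheaper than the
  // reverse widened load it would replace.
  auto IsProfitable = [&](ElementCount VF) -> bool {
    const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
    const InstructionCost StridedLoadStoreCost =
        Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
                                       MemR->isMasked(), Alignment,
                                       Ctx.CostKind, &Ingredient);
    return StridedLoadStoreCost < CurrentCost;
  };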
>From d340701cb7c0c0d21b3dcc84be206a6b345f2a50 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 21 May 2025 01:21:22 -0700
Subject: [PATCH 07/16] Format
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7a67df4cd7b6e..a452e65d34b0f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2734,18 +2734,18 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
// Create a new vector pointer for strided access.
- auto *NewPtr = new VPVectorPointerRecipe(
- Ptr, ElementTy, /*Stride=*/ true,
- GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
- VecEndPtr->getDebugLoc());
+ auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true,
+ GEP ? GEP->getNoWrapFlags()
+ : GEPNoWrapFlags::none(),
+ VecEndPtr->getDebugLoc());
NewPtr->insertBefore(MemR);
auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
auto *LI = cast<LoadInst>(&Ingredient);
const DataLayout &DL = LI->getDataLayout();
auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
- VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
- StrideTy, Stride * DL.getTypeAllocSize(ElementTy)));
+ VPValue *StrideVPV = Plan.getOrAddLiveIn(
+ ConstantInt::get(StrideTy, Stride * DL.getTypeAllocSize(ElementTy)));
auto *StridedLoad = new VPWidenStridedLoadRecipe(
*LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
LoadR->getDebugLoc());
>From 1add5bc2d39b3b575c2e69abb17bd838af8f313b Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 2 Jun 2025 02:27:25 -0700
Subject: [PATCH 08/16] [Unrelated code] Remove
Mel-Chen:legalizeAndOptimizeInductions
We should reopen it after supporting constant strided accesses.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 21 +++++++------------
1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a452e65d34b0f..b9c93f39da7e8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -627,14 +627,12 @@ static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
static void legalizeAndOptimizeInductions(VPlan &Plan) {
using namespace llvm::VPlanPatternMatch;
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- SmallVector<VPWidenInductionRecipe *, 4> InductionPhis;
- for (VPRecipeBase &R : HeaderVPBB->phis())
- if (auto *IV = dyn_cast<VPWidenInductionRecipe>(&R))
- InductionPhis.push_back(IV);
-
bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
- VPBuilder Builder;
- for (VPWidenInductionRecipe *PhiR : reverse(InductionPhis)) {
+ VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+ for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
+ if (!PhiR)
+ continue;
// Try to narrow wide and replicating recipes to uniform recipes, based on
// VPlan analysis.
// TODO: Apply to all recipes in the future, to replace legacy uniformity
@@ -644,8 +642,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
auto *Def = dyn_cast<VPSingleDefRecipe>(U);
auto *RepR = dyn_cast<VPReplicateRecipe>(U);
// Skip recipes that shouldn't be narrowed.
- if (!Def ||
- !isa<VPReplicateRecipe, VPWidenRecipe, VPWidenGEPRecipe>(Def) ||
+ if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
(RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
continue;
@@ -658,13 +655,11 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
Def->operands(), /*IsUniform*/ true);
Clone->insertAfter(Def);
Def->replaceAllUsesWith(Clone);
- Def->eraseFromParent();
}
- Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
// Replace wide pointer inductions which have only their scalars used by
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
- if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(PhiR)) {
+ if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
continue;
@@ -685,7 +680,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
// Replace widened induction with scalar steps for users that only use
// scalars.
- auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(PhiR);
+ auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
return U->usesScalars(WideIV);
}))
>From 78fc57e0d6ed664e3da2025f89842f20710af612 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 2 Jun 2025 02:41:39 -0700
Subject: [PATCH 09/16] [Fix] Remove unused debug info setting.
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8886bc8765b2e..c992d570a3c6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3247,7 +3247,6 @@ void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
- State.setDebugLocFrom(getDebugLoc());
Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
Value *Stride = State.get(getStride(), /*IsScalar*/ true);
Value *Mask = nullptr;
>From 0f8cfb406ed81c906bdf0738e8603e1b91dca90f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 2 Jun 2025 02:49:02 -0700
Subject: [PATCH 10/16] [Fix] Set Opcode as Instruction::Load directly.
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b9c93f39da7e8..d5bc733b0e669 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2712,7 +2712,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
const InstructionCost StridedLoadStoreCost =
Ctx.TTI.getStridedMemoryOpCost(
- Ingredient.getOpcode(), DataTy,
+ Instruction::Load, DataTy,
getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
Alignment, Ctx.CostKind, &Ingredient);
return StridedLoadStoreCost < CurrentCost;
>From e27b3da630b259275e83af4785fc63b08a33154e Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 2 Jun 2025 06:13:36 -0700
Subject: [PATCH 11/16] [Fix] Replace getLoadStorePointerOperand with
Ptr->getUnderlyingValue().
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 20 +++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d5bc733b0e669..0f2e88362432c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2701,6 +2701,14 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
continue;
+ auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
+ VPValue *Ptr = VecEndPtr->getPtr();
+ Value *PtrUV = Ptr->getUnderlyingValue();
+ // Memory cost model requires the pointer operand of memory access
+ // instruction.
+ if (!PtrUV)
+ continue;
+
Instruction &Ingredient = MemR->getIngredient();
Type *ElementTy = getLoadStoreType(&Ingredient);
@@ -2711,10 +2719,9 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
return false;
const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
const InstructionCost StridedLoadStoreCost =
- Ctx.TTI.getStridedMemoryOpCost(
- Instruction::Load, DataTy,
- getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
- Alignment, Ctx.CostKind, &Ingredient);
+ Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
+ MemR->isMasked(), Alignment,
+ Ctx.CostKind, &Ingredient);
return StridedLoadStoreCost < CurrentCost;
};
@@ -2724,10 +2731,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
// The stride of consecutive reverse access must be -1.
int64_t Stride = -1;
- auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
- VPValue *Ptr = VecEndPtr->getPtr();
- auto *GEP = dyn_cast<GetElementPtrInst>(
- Ptr->getUnderlyingValue()->stripPointerCasts());
+ auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
// Create a new vector pointer for strided access.
auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true,
GEP ? GEP->getNoWrapFlags()
>From f6a722f5827a90676904c4c54e10ef4d235c463d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 11 Jun 2025 00:42:09 -0700
Subject: [PATCH 12/16] [Fix] Pass stride in element size
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++++-
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++--
.../LoopVectorize/RISCV/riscv-vector-reverse.ll | 8 ++++----
3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c992d570a3c6e..0d1209004dfad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3259,9 +3259,12 @@ void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
auto *PtrTy = Addr->getType();
auto *StrideTy = Stride->getType();
+ const DataLayout &DL = Ingredient.getDataLayout();
+ Value *StrideInBytes = Builder.CreateMul(
+ Stride, ConstantInt::get(StrideTy, DL.getTypeAllocSize(ScalarDataTy)));
CallInst *NewLI = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
- {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+ {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0f2e88362432c..1fe2f58f80789 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2743,8 +2743,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
auto *LI = cast<LoadInst>(&Ingredient);
const DataLayout &DL = LI->getDataLayout();
auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
- VPValue *StrideVPV = Plan.getOrAddLiveIn(
- ConstantInt::get(StrideTy, Stride * DL.getTypeAllocSize(ElementTy)));
+ VPValue *StrideVPV =
+ Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride));
auto *StridedLoad = new VPWidenStridedLoadRecipe(
*LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
LoadR->getDebugLoc());
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 97afa9f87ac24..31681824624be 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -104,7 +104,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
@@ -238,7 +238,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
@@ -511,7 +511,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
@@ -645,7 +645,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
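With this patch the VPlan-level stride stays in elements (-1 for reverse) and the byte stride is only materialized when the recipe executes. The snippet below is condensed from the VPlanRecipes.cpp hunk above; Addr, Mask, RunTimeVF and the type variables are as in VPWidenStridedLoadRecipe::execute.

  // Scale the element stride by the element size, then emit the VP strided
  // load, which takes its stride in bytes.
  const DataLayout &DL = Ingredient.getDataLayout();
  Value *StrideInBytes = Builder.CreateMul(
      Stride, ConstantInt::get(StrideTy, DL.getTypeAllocSize(ScalarDataTy)));
  CallInst *NewLI = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
      {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");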
>From fe13002cae3582a3a9adab820baef487fd79354c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 11 Jun 2025 03:51:39 -0700
Subject: [PATCH 13/16] [Fix] New operand Stride for VPVectorPointerRecipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 11 +++++++----
llvm/lib/Transforms/Vectorize/VPlan.h | 19 ++++++++-----------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 18 +++++++++++-------
.../Transforms/Vectorize/VPlanTransforms.cpp | 15 +++++++--------
.../RISCV/riscv-vector-reverse.ll | 8 ++++----
.../LoopVectorize/vplan-dot-printing.ll | 4 ++--
6 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d2d44e170fde3..3910169a346c0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7774,10 +7774,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, I->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(
- Ptr, getLoadStoreType(I), /*Strided*/ false,
- GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
- I->getDebugLoc());
+ const DataLayout &DL = I->getDataLayout();
+ auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
+ VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+ VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
+ GEP ? GEP->getNoWrapFlags()
+ : GEPNoWrapFlags::none(),
+ I->getDebugLoc());
}
Builder.insert(VectorPtr);
Ptr = VectorPtr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c9e51d9abaf90..b012c0149b39a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1767,24 +1767,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
};
/// A recipe to compute the pointers for widened memory accesses of IndexTy.
-/// Supports both consecutive and reverse consecutive accesses.
-/// TODO: Support non-unit strided accesses .
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
- public VPUnrollPartAccessor<1> {
+ public VPUnrollPartAccessor<2> {
Type *IndexedTy;
- /// Indicate whether to compute the pointer for strided memory accesses.
- bool Strided;
-
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride,
GEPNoWrapFlags GEPFlags, DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
- GEPFlags, DL),
- IndexedTy(IndexedTy), Strided(Strided) {}
+ : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
+ ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
+ IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
+ VPValue *getStride() const { return getOperand(1); }
+
void execute(VPTransformState &State) override;
bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -1802,7 +1799,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
}
VPVectorPointerRecipe *clone() override {
- return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
+ return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getOperand(1),
getGEPNoWrapFlags(), getDebugLoc());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0d1209004dfad..53dab59316126 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2384,16 +2384,20 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
void VPVectorPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
- Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
- /*IsUnitStride*/ true, CurrentPart, Builder);
+ Value *Stride = State.get(getStride(), /*IsScalar*/ true);
+
+ auto *StrideC = dyn_cast<ConstantInt>(Stride);
+ bool IsStrideOne = StrideC && StrideC->isOne();
+ bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne());
+ Type *IndexTy =
+ getGEPIndexTy(State.VF.isScalable(),
+ /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
+ Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy);
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
- // TODO: Support non-unit-reverse strided accesses.
- Value *Index =
- Strided
- ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
- : Increment;
+ Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride);
+
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1fe2f58f80789..948145e632afd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2732,22 +2732,21 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
// The stride of consecutive reverse access must be -1.
int64_t Stride = -1;
auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
+ const DataLayout &DL = Ingredient.getDataLayout();
+ auto *StrideTy = DL.getIndexType(PtrUV->getType());
+ VPValue *StrideVPV =
+ Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride));
// Create a new vector pointer for strided access.
- auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true,
+ auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, StrideVPV,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
VecEndPtr->getDebugLoc());
NewPtr->insertBefore(MemR);
auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
- auto *LI = cast<LoadInst>(&Ingredient);
- const DataLayout &DL = LI->getDataLayout();
- auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
- VPValue *StrideVPV =
- Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, Stride));
auto *StridedLoad = new VPWidenStridedLoadRecipe(
- *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
- LoadR->getDebugLoc());
+ *cast<LoadInst>(&Ingredient), NewPtr, StrideVPV, &Plan.getVF(),
+ LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
StridedLoad->insertBefore(LoadR);
LoadR->replaceAllUsesWith(StridedLoad);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 31681824624be..ea193aff5593b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -103,7 +103,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
@@ -237,7 +237,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
@@ -510,7 +510,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
@@ -644,7 +644,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>, ir<-1>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-1>, runtimeVF = ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
index 528f2448616e8..2c757021e76ff 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
@@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" +
; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>, ir\<1\>\l" +
; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>, ir\<1\>\l" +
; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" +
; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" +
>From d4ea1d663270b07f5621a1b96c6a6591d06d475f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 16 Jun 2025 02:17:19 -0700
Subject: [PATCH 14/16] [Comment] Remove the unrelated change, nfc
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 948145e632afd..5c1dfeea81205 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -633,6 +633,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
if (!PhiR)
continue;
+
// Try to narrow wide and replicating recipes to uniform recipes, based on
// VPlan analysis.
// TODO: Apply to all recipes in the future, to replace legacy uniformity
>From bb03212376c41e3a34ee1fb2424b0b5f2e56a853 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 16 Jun 2025 02:25:50 -0700
Subject: [PATCH 15/16] [Comment] Add assert for consecutive, nfc
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5c1dfeea81205..89573b9f18033 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2702,6 +2702,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
continue;
+ assert(MemR->isConsecutive() && "Reverse access must be consecutive");
+
auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
VPValue *Ptr = VecEndPtr->getPtr();
Value *PtrUV = Ptr->getUnderlyingValue();
>From d9e53618e4c97a46c632c44f084ff79d60352e64 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 16 Jun 2025 03:06:21 -0700
Subject: [PATCH 16/16] [Comment] Update comment of VPVectorPointerRecipe
---
llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b012c0149b39a..7ca9eedc46cf0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1766,7 +1766,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
-/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// A recipe to compute the pointers for widened memory accesses of IndexedTy,
+/// with the Stride expressed in units of IndexedTy.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
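To make "units of IndexedTy" concrete: consecutive accesses now pass a live-in constant 1 (one element) and the reverse-to-strided transform passes -1; scaling to a byte stride is left to the strided-load recipe (see PATCH 12). Condensed from the tryToWidenMemory hunk in PATCH 13:

  // Unit stride, expressed in elements of the accessed type.
  const DataLayout &DL = I->getDataLayout();
  auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
  VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
  VectorPtr = new VPVectorPointerRecipe(
      Ptr, getLoadStoreType(I), StrideOne,
      GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc());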