[llvm] [LV] Convert gather loads with invariant stride into strided loads (PR #147297)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 04:44:44 PST 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/147297
>From 3121bb9cd99163139d9e391a6703f057fcc47a2f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 19:01:57 -0700
Subject: [PATCH 01/25] New VPWidenStridedLoadRecipe
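For context, a rough sketch of what the new recipe's execute() emits (illustrative only: %base, %mask and %evl are placeholder names, and the i32 element type and 16-byte stride are made up for the example). The address and stride stay scalar while the mask is a vector:

  %wide.strided.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %base, i64 16, <vscale x 4 x i1> %mask, i32 %evl)

When the recipe has no mask, a splat of i1 true is passed instead, and the VF operand is zero-extended or truncated to i32 before the call.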
---
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 50 +++++++++++++++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 57 +++++++++++++++++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 14 +++--
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 3 +-
7 files changed, 119 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 15d0fa41bd902..0d6cd9bb0de29 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4021,7 +4021,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
- .Case<VPWidenLoadRecipe>(
+ .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4124,6 +4124,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 972823137023f..dae37490a720f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -572,6 +572,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -3288,7 +3289,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
}
static inline bool classof(const VPUser *U) {
@@ -3414,6 +3416,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
#endif
};
+/// A recipe for strided load operations, using the base address, stride, and an
+/// optional mask. This recipe will generate a vp.strided.load intrinsic call
+/// to represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+ public VPValue {
+ VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+ VPValue *VF, VPValue *Mask,
+ const VPIRMetadata &Metadata, DebugLoc DL)
+ : VPWidenMemoryRecipe(
+ VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+ /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+ VPValue(this, &Load) {
+ setMask(Mask);
+ }
+
+ VPWidenStridedLoadRecipe *clone() override {
+ return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+ getStride(), getVF(), getMask(), *this,
+ getDebugLoc());
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+ /// Return the stride operand.
+ VPValue *getStride() const { return getOperand(1); }
+
+ /// Return the VF operand.
+ VPValue *getVF() const { return getOperand(2); }
+
+ /// Generate a strided load.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() || Op == getStride() || Op == getVF();
+ }
+};
+
/// A recipe for widening store operations, using the stored value, the address
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c64b97579881a..f44b6ccf8c35a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -194,8 +194,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
- "Store recipes should not define any values");
+ assert(
+ (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 519a104b9484f..333d490f66ef0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -88,6 +88,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -112,6 +113,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -201,6 +203,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -3475,9 +3478,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
- unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
- ? Instruction::Load
- : Instruction::Store;
+ unsigned Opcode =
+ isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ this)
+ ? Instruction::Load
+ : Instruction::Store;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -3487,8 +3492,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
"Inconsecutive memory access should not have the order.");
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- Type *PtrTy = Ptr->getType();
+ if (isa<VPWidenStridedLoadRecipe>(this))
+ return Ctx.TTI.getStridedMemoryOpCost(
+ Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+ Type *PtrTy = Ptr->getType();
// If the address value is uniform across all lanes, then the address can be
// calculated with scalar type and broadcast.
if (!vputils::isSingleScalar(getAddr()))
@@ -3650,6 +3658,47 @@ void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
+ Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask())
+ Mask = State.get(VPMask);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
+ Builder.getInt32Ty());
+
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = StrideInBytes->getType();
+ CallInst *NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ applyMetadata(*NewLI);
+ State.set(this, NewLI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = load ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", stride = ";
+ getStride()->printAsOperand(O, SlotTracker);
+ O << ", runtimeVF = ";
+ getVF()->printAsOperand(O, SlotTracker);
+}
+#endif
+
void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2242f95e4226c..61517786a27c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2850,10 +2850,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
- assert(all_of(Plan.getVF().users(),
- IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
- VPWidenIntOrFpInductionRecipe>) &&
- "User of VF that we can't transform to EVL.");
+ assert(
+ all_of(
+ Plan.getVF().users(),
+ IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
+ VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>) &&
+ "User of VF that we can't transform to EVL.");
Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
});
@@ -2950,8 +2952,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
"New recipe must define the same number of values as the "
"original.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
- EVLRecipe)) {
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe,
+ VPInterleaveEVLRecipe>(EVLRecipe)) {
for (unsigned I = 0; I < NumDefVal; ++I) {
VPValue *CurVPV = CurRecipe->getVPValue(I);
CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index b9f5847ec731c..ae946836170ba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -358,6 +358,7 @@ class VPDef {
VPWidenCastSC,
VPWidenGEPSC,
VPWidenIntrinsicSC,
+ VPWidenStridedLoadSC,
VPWidenLoadEVLSC,
VPWidenLoadSC,
VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 2d63d2a787f88..a36aee96e4154 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -162,7 +162,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
return VerifyEVLUse(*S, S->getNumOperands() - 1);
})
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
- VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
+ VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe,
+ VPWidenStridedLoadRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
.Case<VPScalarIVStepsRecipe>([&](auto *R) {
if (R->getNumOperands() != 3) {
>From f95e9ad2161c1f2a5b617d271ccbf7f46c0e2d71 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 7 Jul 2025 01:02:03 -0700
Subject: [PATCH 02/25] Expand VPVectorPointerRecipe to support stride
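A minimal sketch of the address computation this enables (placeholder names; the i16 element type and stride of 3 are chosen only for illustration). For a non-unit stride the per-part increment is now multiplied by the stride before forming the GEP:

  %vf = mul i64 %vscale, 8                   ; step for this unroll part
  %scaled = mul i64 %vf, 3                   ; scale by the stride operand
  %vec.ptr = getelementptr i16, ptr %base, i64 %scaled

For the common stride-of-one case the multiply is skipped, so existing codegen is unchanged; only the printed form of the recipe gains the extra ", ir<1>" operand, as the test updates below show.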
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 ++++-
llvm/lib/Transforms/Vectorize/VPlan.h | 22 +++++++++++--------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++++++++++----
.../LoopVectorize/vplan-dot-printing.ll | 4 ++--
.../LoopVectorize/vplan-printing.ll | 4 ++--
5 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0d6cd9bb0de29..383d12225fd80 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7665,7 +7665,10 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, VPI->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+ const DataLayout &DL = I->getDataLayout();
+ auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
+ VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+ VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
VPI->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index dae37490a720f..1a7d787a741c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1967,20 +1967,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
-/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// A recipe to compute the pointers for widened memory accesses of IndexedTy,
+/// with the Stride expressed in units of IndexedTy.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
- public VPUnrollPartAccessor<1> {
+ public VPUnrollPartAccessor<2> {
Type *SourceElementTy;
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy,
+ VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy, VPValue *Stride,
GEPNoWrapFlags GEPFlags, DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
- GEPFlags, DL),
+ : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
+ ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
SourceElementTy(SourceElementTy) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
+ VPValue *getStride() const { return getOperand(1); }
+
void execute(VPTransformState &State) override;
Type *getSourceElementType() const { return SourceElementTy; }
@@ -2001,7 +2004,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
VPVectorPointerRecipe *clone() override {
return new VPVectorPointerRecipe(getOperand(0), SourceElementTy,
- getGEPNoWrapFlags(), getDebugLoc());
+ getStride(), getGEPNoWrapFlags(),
+ getDebugLoc());
}
/// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that
@@ -3450,12 +3454,12 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ void printRecipe(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getAddr() || Op == getStride() || Op == getVF();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 333d490f66ef0..de9d627e15ef1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2533,8 +2533,14 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this));
Value *Ptr = State.get(getOperand(0), VPLane(0));
+ Value *Stride = State.get(getStride(), /*IsScalar*/ true);
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+ auto *StrideC = dyn_cast<ConstantInt>(Stride);
+ if (!StrideC || !StrideC->isOne()) {
+ Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy);
+ Increment = Builder.CreateMul(Increment, Stride);
+ }
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
"", getGEPNoWrapFlags());
@@ -3493,8 +3499,10 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
if (isa<VPWidenStridedLoadRecipe>(this))
- return Ctx.TTI.getStridedMemoryOpCost(
- Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+ return Ctx.TTI.getMemIntrinsicInstrCost(
+ MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
+ Ty, Ptr, IsMasked, Alignment, &Ingredient),
+ Ctx.CostKind);
Type *PtrTy = Ptr->getType();
// If the address value is uniform across all lanes, then the address can be
@@ -3686,8 +3694,8 @@ void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+void VPWidenStridedLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = load ";
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
index ee3564bc87be4..75d2b29e180a9 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
@@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" +
; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer inbounds ir\<%arrayidx\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer inbounds ir\<%arrayidx\>, ir\<1\>\l" +
; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer inbounds ir\<%arrayidx2\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer inbounds ir\<%arrayidx2\>, ir\<1\>\l" +
; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" +
; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" +
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index f44dfe018aadd..33b216168431a 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -348,7 +348,7 @@ define void @recipe_debug_loc_location(ptr nocapture %src) !dbg !5 {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>, !dbg /tmp/s.c:5:3
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%isd>, !dbg /tmp/s.c:6:3
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%isd>, ir<1>, !dbg /tmp/s.c:6:3
; CHECK-NEXT: WIDEN ir<%lsd> = load vp<[[VEC_PTR]]>, !dbg /tmp/s.c:6:3
; CHECK-NEXT: WIDEN ir<%psd> = add nuw nsw ir<%lsd>, ir<23>, !dbg /tmp/s.c:7:3
; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%lsd>, ir<100>, !dbg /tmp/s.c:8:3
@@ -375,7 +375,7 @@ define void @recipe_debug_loc_location(ptr nocapture %src) !dbg !5 {
; CHECK-EMPTY:
; CHECK-NEXT: if.then.0:
; CHECK-NEXT: BLEND ir<%ysd.0> = ir<%psd> vp<[[PHI]]>/vp<[[OR1]]>, !dbg /tmp/s.c:14:3
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer inbounds ir<%isd>, !dbg /tmp/s.c:15:3
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer inbounds ir<%isd>, ir<1>, !dbg /tmp/s.c:15:3
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>, !dbg /tmp/s.c:15:3
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
>From 80796cff3bb06e2c7b2844a125635f0f3d70830a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 20:38:38 -0700
Subject: [PATCH 03/25] Transform gathers into strided loads
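Roughly, for an i32 gather whose index advances by 16 elements per iteration (the masked_gather_scatter.ll case updated below), the transform rewrites

  %ptrs = getelementptr inbounds i32, ptr %trigger, <vscale x 2 x i64> %vec.ind
  %v = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 %ptrs, <vscale x 2 x i1> splat (i1 true), i32 %evl)

into a scalar base address plus a constant byte stride (16 elements * 4 bytes = 64):

  %base = getelementptr inbounds i32, ptr %trigger, i64 %offset.idx
  %v = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 %base, i64 64, <vscale x 2 x i1> %mask, i32 %vf)

Here %evl, %mask, %vf and %offset.idx are placeholders; the exact values appear in the test diffs below.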
---
.../Transforms/Vectorize/LoopVectorize.cpp | 20 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 27 ++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 183 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 6 +
.../RISCV/blocks-with-dead-instructions.ll | 13 +-
.../RISCV/masked_gather_scatter.ll | 120 +++++++-----
.../LoopVectorize/RISCV/pr154103.ll | 60 ++++--
.../LoopVectorize/RISCV/strided-accesses.ll | 180 ++++++++++-------
.../RISCV/tail-folding-gather-scatter.ll | 80 ++++++--
.../RISCV/tail-folding-interleave.ll | 174 ++++++++---------
10 files changed, 608 insertions(+), 255 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 383d12225fd80..0ec8eb3c13489 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8627,20 +8627,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan))
return nullptr;
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE(), OrigLoop);
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
- if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE(), OrigLoop);
+ if (!CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
- }
-
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
@@ -8653,6 +8648,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
Legal->getLAI()->getSymbolicStrides());
+ // Convert memory recipes to strided access recipes if the strided access is
+ // legal and profitable.
+ VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+ CostCtx, Range);
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+ Plan->setName("Initial VPlan");
+
auto BlockNeedsPredication = [this](BasicBlock *BB) {
return Legal->blockNeedsPredication(BB);
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1a7d787a741c8..e171ed948d24a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1849,10 +1849,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
Type *SourceElementTy;
- bool isPointerLoopInvariant() const {
- return getOperand(0)->isDefinedOutsideLoopRegions();
- }
-
bool isIndexLoopInvariant(unsigned I) const {
return getOperand(I + 1)->isDefinedOutsideLoopRegions();
}
@@ -1882,6 +1878,29 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// This recipe generates a GEP instruction.
unsigned getOpcode() const { return Instruction::GetElementPtr; }
+ bool isPointerLoopInvariant() const {
+ return getOperand(0)->isDefinedOutsideLoopRegions();
+ }
+
+ std::optional<unsigned> getUniqueVariantIndex() const {
+ std::optional<unsigned> VarIdx;
+ for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+ if (isIndexLoopInvariant(I))
+ continue;
+
+ if (VarIdx)
+ return std::nullopt;
+ VarIdx = I;
+ }
+ return VarIdx;
+ }
+
+ Type *getIndexedType(unsigned I) const {
+ auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+ SmallVector<Value *, 4> Ops(GEP->idx_begin(), GEP->idx_begin() + I);
+ return GetElementPtrInst::getIndexedType(SourceElementTy, Ops);
+ }
+
/// Generate the gep nodes.
void execute(VPTransformState &State) override;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 61517786a27c1..1c84c143083a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5115,3 +5115,186 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}
+
+static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
+ // TODO: Support VPWidenPointerInductionRecipe.
+ if (auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(CurIndex))
+ return {WidenIV, WidenIV->getStepValue()};
+
+ auto *WidenR = dyn_cast<VPWidenRecipe>(CurIndex);
+ if (!WidenR || !CurIndex->getUnderlyingValue())
+ return {nullptr, nullptr};
+
+ unsigned Opcode = WidenR->getOpcode();
+ // TODO: Support Instruction::Add and Instruction::Or.
+ if (Opcode != Instruction::Shl && Opcode != Instruction::Mul)
+ return {nullptr, nullptr};
+
+ // Match the pattern binop(variant, invariant), or binop(invariant, variant)
+ // if the binary operator is commutative.
+ bool IsLHSUniform = vputils::isSingleScalar(WidenR->getOperand(0));
+ if (IsLHSUniform == vputils::isSingleScalar(WidenR->getOperand(1)) ||
+ (IsLHSUniform && !Instruction::isCommutative(Opcode)))
+ return {nullptr, nullptr};
+ unsigned VarIdx = IsLHSUniform ? 1 : 0;
+
+ auto [Start, Stride] = matchStridedStart(WidenR->getOperand(VarIdx));
+ if (!Start)
+ return {nullptr, nullptr};
+
+ SmallVector<VPValue *> StartOps(WidenR->operands());
+ StartOps[VarIdx] = Start;
+ auto *StartR = new VPReplicateRecipe(WidenR->getUnderlyingInstr(), StartOps,
+ /*IsUniform*/ true);
+ StartR->insertBefore(WidenR);
+
+ unsigned InvIdx = VarIdx == 0 ? 1 : 0;
+ auto *StrideR =
+ new VPInstruction(Opcode, {Stride, WidenR->getOperand(InvIdx)});
+ StrideR->insertBefore(WidenR);
+ return {StartR, StrideR};
+}
+
+static std::tuple<VPValue *, VPValue *, Type *>
+determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) {
+ // TODO: Check if the base pointer is strided.
+ if (!WidenGEP->isPointerLoopInvariant())
+ return {nullptr, nullptr, nullptr};
+
+ // Find the unique variant index.
+ std::optional<unsigned> VarIndex = WidenGEP->getUniqueVariantIndex();
+ if (!VarIndex)
+ return {nullptr, nullptr, nullptr};
+
+ Type *ElementTy = WidenGEP->getIndexedType(*VarIndex);
+ if (ElementTy->isScalableTy() || ElementTy->isStructTy() ||
+ ElementTy->isVectorTy())
+ return {nullptr, nullptr, nullptr};
+
+ unsigned VarOp = *VarIndex + 1;
+ VPValue *IndexVPV = WidenGEP->getOperand(VarOp);
+ auto [Start, Stride] = matchStridedStart(IndexVPV);
+ if (!Start)
+ return {nullptr, nullptr, nullptr};
+
+ SmallVector<VPValue *> Ops(WidenGEP->operands());
+ Ops[VarOp] = Start;
+ auto *BasePtr = new VPReplicateRecipe(WidenGEP->getUnderlyingInstr(), Ops,
+ /*IsUniform*/ true);
+ BasePtr->insertBefore(WidenGEP);
+
+ return {BasePtr, Stride, ElementTy};
+}
+
+void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range) {
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ VPTypeAnalysis TypeInfo(Plan);
+ DenseMap<VPWidenGEPRecipe *, std::tuple<VPValue *, VPValue *, Type *>>
+ StrideCache;
+ SmallVector<VPRecipeBase *> ToErase;
+ SmallPtrSet<VPValue *, 4> PossiblyDead;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+ // TODO: Support strided store.
+ // TODO: Transform reverse access into strided access with -1 stride.
+ // TODO: Transform gather/scatter with uniform address into strided access
+ // with 0 stride.
+ // TODO: Transform interleave access into multiple strided accesses.
+ if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || MemR->isConsecutive())
+ continue;
+
+ auto *Ptr = dyn_cast<VPWidenGEPRecipe>(MemR->getAddr());
+ if (!Ptr)
+ continue;
+
+ // The memory cost model requires the pointer operand of the memory access
+ // instruction.
+ Value *PtrUV = Ptr->getUnderlyingValue();
+ if (!PtrUV)
+ continue;
+
+ // Try to get base and stride here.
+ VPValue *BasePtr, *StrideInElement;
+ Type *ElementTy;
+ auto It = StrideCache.find(Ptr);
+ if (It != StrideCache.end())
+ std::tie(BasePtr, StrideInElement, ElementTy) = It->second;
+ else
+ std::tie(BasePtr, StrideInElement, ElementTy) = StrideCache[Ptr] =
+ determineBaseAndStride(Ptr);
+
+ // Skip if the memory access is not a strided access.
+ if (!BasePtr) {
+ assert(!StrideInElement && !ElementTy);
+ continue;
+ }
+ assert(StrideInElement && ElementTy);
+
+ Instruction &Ingredient = MemR->getIngredient();
+ auto IsProfitable = [&](ElementCount VF) -> bool {
+ Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+ return false;
+ const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+ const InstructionCost StridedLoadStoreCost =
+ Ctx.TTI.getMemIntrinsicInstrCost(
+ MemIntrinsicCostAttributes(
+ Intrinsic::experimental_vp_strided_load, DataTy, PtrUV,
+ MemR->isMasked(), Alignment, &Ingredient),
+ Ctx.CostKind);
+ return StridedLoadStoreCost < CurrentCost;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
+ Range)) {
+ PossiblyDead.insert(BasePtr);
+ PossiblyDead.insert(StrideInElement);
+ continue;
+ }
+ PossiblyDead.insert(Ptr);
+
+ // Create a new vector pointer for strided access.
+ auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
+ auto *NewPtr = new VPVectorPointerRecipe(
+ BasePtr, ElementTy, StrideInElement,
+ GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+ Ptr->getDebugLoc());
+ NewPtr->insertBefore(MemR);
+
+ const DataLayout &DL = Ingredient.getDataLayout();
+ TypeSize TS = DL.getTypeAllocSize(ElementTy);
+ unsigned TypeScale = TS.getFixedValue();
+ VPValue *StrideInBytes = StrideInElement;
+ // Scale the stride by the size of the indexed type.
+ if (TypeScale != 1) {
+ VPValue *ScaleVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+ TypeInfo.inferScalarType(StrideInElement), TypeScale));
+ auto *ScaledStride =
+ new VPInstruction(Instruction::Mul, {StrideInElement, ScaleVPV});
+ ScaledStride->insertBefore(MemR);
+ StrideInBytes = ScaledStride;
+ }
+
+ auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
+ auto *StridedLoad = new VPWidenStridedLoadRecipe(
+ *cast<LoadInst>(&Ingredient), NewPtr, StrideInBytes, &Plan.getVF(),
+ LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
+ StridedLoad->insertBefore(LoadR);
+ LoadR->replaceAllUsesWith(StridedLoad);
+
+ ToErase.push_back(LoadR);
+ }
+ }
+
+ // Clean up dead memory access recipes, and unused base address and stride.
+ for (auto *R : ToErase)
+ R->eraseFromParent();
+ for (auto *V : PossiblyDead)
+ recursivelyDeleteDeadRecipes(V);
+}
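To make the index decomposition above concrete (a hand-worked sketch with placeholder names; the constants match the RV64 checks in masked_gather_scatter.ll below): for a widened index

  %idx = shl nuw nsw <vscale x 2 x i64> %vec.ind, splat (i64 1)

matchStridedStart peels the widened IV and returns a scalar start plus a stride in elements,

  %start.idx = shl nuw nsw i64 %iv, 1        ; uniform replicate of the shl
  %stride.elem = shl i64 16, 1               ; IV step shifted by the invariant amount

and convertToStridedAccesses then scales that stride by the size of the indexed type (double, 8 bytes) to get the byte stride passed to the strided load:

  %stride.byte = mul i64 %stride.elem, 8     ; 32 elements * 8 bytes = 256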
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index afdf1655b4622..6af05d2383f5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -253,6 +253,12 @@ struct VPlanTransforms {
&InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
+ /// Transform widen memory recipes into strided access recipes when legal
+ /// and profitable. Clamps \p Range to maintain consistency with widen
+ /// decisions of \p Plan, and uses \p Ctx to evaluate the cost.
+ static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range);
+
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index adf443b74acf1..0d392021460d0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -315,6 +315,8 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[IC]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
@@ -323,15 +325,23 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <vscale x 8 x i64> zeroinitializer, [[TMP5]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP27]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP27]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 3, [[TMP12]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <vscale x 8 x i32> [[TMP18]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.vp.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
+; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vp.strided.load.nxv8i16.p0.i64(ptr align 2 [[TMP21]], i64 6, <vscale x 8 x i1> [[TMP19]], i32 [[TMP15]])
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i1> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[TMP17]], splat (i1 true)
@@ -339,6 +349,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index f49f9284a1e93..dfa46bf054fc4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -16,7 +16,7 @@
define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; RV32-LABEL: @foo4(
; RV32-NEXT: entry:
-; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; RV32-NEXT: br label [[VECTOR_SCEVCHECK:%.*]]
; RV32: vector.scevcheck:
; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624)
; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
@@ -31,63 +31,75 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]]
; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]]
; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]]
-; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]]
+; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; RV32: vector.memcheck:
-; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
-; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880
-; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752
-; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
-; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
+; RV32-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[A]], i32 79880
+; RV32-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[B]], i32 159752
+; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP4]]
+; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
-; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
-; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
-; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; RV32-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[A]], [[SCEVGEP5]]
+; RV32-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[B]], [[SCEVGEP4]]
+; RV32-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]]
+; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT8]]
; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; RV32: vector.ph:
-; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; RV32-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 16)
-; RV32-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP9]]
+; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; RV32-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; RV32-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP9]], splat (i64 16)
+; RV32-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP10]]
; RV32-NEXT: br label [[VECTOR_BODY:%.*]]
; RV32: vector.body:
+; RV32-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV32-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; RV32-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
-; RV32-NEXT: [[TMP11:%.*]] = mul nuw nsw i64 16, [[TMP8]]
-; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
-; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]]
-; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
-; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]]
-; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
-; RV32-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV32-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; RV32-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
+; RV32-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT9]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; RV32-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+; RV32-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 16, [[TMP12]]
+; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
+; RV32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; RV32-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; RV32-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP14]], [[BROADCAST_SPLAT10]]
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16
+; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
+; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP8]] to i32
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP16]], i64 64, <vscale x 2 x i1> [[TMP15]], i32 [[TMP17]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; RV32-NEXT: [[TMP18:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 100)
+; RV32-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP18]], <vscale x 2 x i1> zeroinitializer
+; RV32-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP20]]
+; RV32-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP8]] to i32
+; RV32-NEXT: [[WIDE_STRIDED_LOAD11:%.*]] = call <vscale x 2 x double> @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP21]], i64 256, <vscale x 2 x i1> [[TMP19]], i32 [[TMP22]]), !alias.scope [[META5:![0-9]+]]
+; RV32-NEXT: [[TMP23:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_STRIDED_LOAD]] to <vscale x 2 x double>
+; RV32-NEXT: [[TMP24:%.*]] = fadd <vscale x 2 x double> [[WIDE_STRIDED_LOAD11]], [[TMP23]]
+; RV32-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
+; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP24]], <vscale x 2 x ptr> align 8 [[TMP25]], <vscale x 2 x i1> [[TMP18]], i32 [[TMP11]]), !alias.scope [[META3]], !noalias [[META5]]
+; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
+; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
+; RV32-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; RV32-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; RV32-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; RV32: middle.block:
; RV32-NEXT: br label [[FOR_END:%.*]]
; RV32: scalar.ph:
-; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ]
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; RV32-NEXT: br label [[FOR_BODY:%.*]]
; RV32: for.body:
; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100
+; RV32-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP27]], 100
; RV32-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; RV32: if.then:
-; RV32-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]]
-; RV32-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double
-; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]]
+; RV32-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP28]]
+; RV32-NEXT: [[TMP29:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP27]] to double
+; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP29]], [[CONV]]
; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; RV32-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; RV32-NEXT: br label [[FOR_INC]]
@@ -114,28 +126,40 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; RV64: vector.ph:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; RV64-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; RV64-NEXT: [[TMP1:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 16)
-; RV64-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP1]]
+; RV64-NEXT: [[TMP3:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 16)
+; RV64-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP3]]
; RV64-NEXT: br label [[VECTOR_BODY:%.*]]
; RV64: vector.body:
+; RV64-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV64-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV64-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; RV64-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP10]], i64 0
+; RV64-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; RV64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
; RV64-NEXT: [[TMP11:%.*]] = mul nuw nsw i64 16, [[TMP8]]
; RV64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
+; RV64-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; RV64-NEXT: [[TMP25:%.*]] = icmp ult <vscale x 2 x i32> [[TMP16]], [[BROADCAST_SPLAT7]]
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16
+; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
+; RV64-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP1]] to i32
+; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, <vscale x 2 x i1> [[TMP25]], i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]]
; RV64-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
-; RV64-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1)
-; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]]
+; RV64-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[TMP25]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
+; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1
+; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]]
+; RV64-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP1]] to i32
+; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, <vscale x 2 x i1> [[TMP12]], i32 [[TMP15]]), !alias.scope [[META3:![0-9]+]]
; RV64-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
; RV64-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
; RV64-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; RV64-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; RV64-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
index c35a3d7b9269f..7503165665642 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
@@ -6,29 +6,47 @@
define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
; CHECK-LABEL: define void @pr154103(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[EVL_BASED_IV]], 7
+; CHECK-NEXT: [[IV:%.*]] = add i64 1, [[TMP5]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X]] to i64
-; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 0, [[CONV]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[DIV]], 0
-; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[LATCH]]
-; CHECK: [[THEN]]:
-; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[Y]] to i64
-; CHECK-NEXT: [[NOT:%.*]] = xor i64 [[ZEXT]], 0
-; CHECK-NEXT: br label %[[LATCH]]
-; CHECK: [[LATCH]]:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[NOT]], %[[THEN]] ], [ 0, %[[LOOP]] ]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[COND]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], ptr [[C]], align 2
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr align 1 [[GEP]], i64 7, <vscale x 4 x i1> [[TMP4]], i32 [[TMP7]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_STRIDED_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> [[TMP8]], <vscale x 4 x i64> splat (i64 1), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP10:%.*]] = sdiv <vscale x 4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt <vscale x 4 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP11]], i32 [[TMP2]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> [[TMP13]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 4 x i64> [[PREDPHI]] to <vscale x 4 x i16>
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[TMP14]], <vscale x 4 x ptr> align 2 [[BROADCAST_SPLAT2]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
; CHECK-NEXT: store i32 0, ptr [[D]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 7
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 96e1ebb48b02c..2b810e8f476b4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -10,22 +10,33 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], 8
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], splat (i64 8)
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP14]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, <vscale x 4 x i1> [[TMP7]], i32 [[TMP19]])
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
@@ -59,12 +70,20 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-UF2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-UF2-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[INDEX]], 8
; CHECK-UF2-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-UF2-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[STEP_ADD]], splat (i64 8)
-; CHECK-UF2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP9]]
+; CHECK-UF2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP21]]
+; CHECK-UF2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP9]]
; CHECK-UF2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP10]]
-; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
-; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; CHECK-UF2-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UF2-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP23]], 2
+; CHECK-UF2-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-UF2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP17]]
+; CHECK-UF2-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP22]], i64 32, <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
+; CHECK-UF2-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; CHECK-UF2-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-UF2-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], splat (i32 1)
; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true))
@@ -114,22 +133,33 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP6]], splat (i64 64)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP6]], splat (i64 64)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP3]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 64, [[TMP11]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 4 x i32> [[TMP15]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 64
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 256, <vscale x 4 x i1> [[TMP16]], i32 [[TMP18]])
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
@@ -163,10 +193,18 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-UF2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[TMP6]]
-; CHECK-UF2-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-UF2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 64
+; CHECK-UF2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-UF2-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-UF2-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[STEP_ADD]]
-; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
-; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; CHECK-UF2-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UF2-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2
+; CHECK-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP20]], 64
+; CHECK-UF2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP14]]
+; CHECK-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 256, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; CHECK-UF2-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 256, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
; CHECK-UF2-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-UF2-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], splat (i32 1)
; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true))
@@ -775,6 +813,9 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; STRIDED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
+; STRIDED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT: [[TMP45:%.*]] = mul nuw i64 [[TMP42]], 4
+; STRIDED-NEXT: [[TMP47:%.*]] = mul i64 [[STRIDE]], 4
; STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
; STRIDED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -782,18 +823,26 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; STRIDED: vector.body:
+; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[TMP43:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; STRIDED-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP43]], i64 0
+; STRIDED-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-NEXT: [[TMP44:%.*]] = zext i32 [[TMP43]] to i64
; STRIDED-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP44]], i64 0
; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; STRIDED-NEXT: [[TMP48:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; STRIDED-NEXT: [[TMP49:%.*]] = icmp ult <vscale x 4 x i32> [[TMP48]], [[BROADCAST_SPLAT12]]
+; STRIDED-NEXT: [[TMP50:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], [[STRIDE]]
; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]]
-; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]]
+; STRIDED-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP50]]
+; STRIDED-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP45]] to i32
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP46]], i64 [[TMP47]], <vscale x 4 x i1> [[TMP49]], i32 [[TMP51]]), !alias.scope [[META5:![0-9]+]]
; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP44]], [[EVL_BASED_IV]]
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
@@ -879,6 +928,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 2
; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP30]]
; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-UF2-NEXT: [[TMP35:%.*]] = mul i64 [[STRIDE]], 4
; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-UF2-NEXT: [[TMP31:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -889,12 +939,18 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; STRIDED-UF2-NEXT: [[TMP36:%.*]] = mul nuw nsw i64 [[INDEX]], [[STRIDE]]
; STRIDED-UF2-NEXT: [[TMP33:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT11]]
; STRIDED-UF2-NEXT: [[TMP34:%.*]] = mul nuw nsw <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT11]]
-; STRIDED-UF2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP33]]
-; STRIDED-UF2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP34]]
-; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META8:![0-9]+]]
-; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP36]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META8]]
+; STRIDED-UF2-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP36]]
+; STRIDED-UF2-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-UF2-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 2
+; STRIDED-UF2-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], [[STRIDE]]
+; STRIDED-UF2-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP44]], i64 [[TMP47]]
+; STRIDED-UF2-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP29]] to i32
+; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP44]], i64 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP42]]), !alias.scope [[META8:![0-9]+]]
+; STRIDED-UF2-NEXT: [[TMP43:%.*]] = trunc i64 [[TMP29]] to i32
+; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP48]], i64 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8]]
; STRIDED-UF2-NEXT: [[TMP37:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; STRIDED-UF2-NEXT: [[TMP38:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER12]], splat (i32 1)
; STRIDED-UF2-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP33]]
@@ -1298,25 +1354,25 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-NEXT: entry:
; NOSTRIDED-NEXT: br label [[VECTOR_PH:%.*]]
; NOSTRIDED: vector.ph:
-; NOSTRIDED-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul <vscale x 2 x i64> [[TMP0]], splat (i64 1)
-; NOSTRIDED-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP1]]
+; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; NOSTRIDED-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP3]], i64 0
-; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; NOSTRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
+; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP2]], i64 0
+; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; NOSTRIDED-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NOSTRIDED-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 2 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]]
+; NOSTRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32
+; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, <vscale x 2 x i1> [[TMP4]], i32 [[TMP10]])
; NOSTRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]]
; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
-; NOSTRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP3]], [[EVL_BASED_IV]]
-; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
-; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; NOSTRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP2]] to i64
+; NOSTRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; NOSTRIDED: middle.block:
@@ -1333,23 +1389,20 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-UF2: vector.ph:
; NOSTRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP3]], i64 0
-; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; NOSTRIDED-UF2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; NOSTRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
; NOSTRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP5]], splat (i64 1)
-; NOSTRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP6]]
; NOSTRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]]
; NOSTRIDED-UF2: vector.body:
; NOSTRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], <vscale x 2 x i64> [[STEP_ADD]]
-; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
-; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
+; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[INDEX]]
+; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1
+; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 [[TMP7]]
+; NOSTRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32
+; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP15]])
+; NOSTRIDED-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
+; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP16]])
; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]]
; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
@@ -1357,7 +1410,6 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-UF2-NEXT: store <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8
; NOSTRIDED-UF2-NEXT: store <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8
; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; NOSTRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; NOSTRIDED-UF2: middle.block:
@@ -1382,25 +1434,25 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-NEXT: entry:
; STRIDED-NEXT: br label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
-; STRIDED-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; STRIDED-NEXT: [[TMP1:%.*]] = mul <vscale x 2 x i64> [[TMP0]], splat (i64 1)
-; STRIDED-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP1]]
+; STRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; STRIDED: vector.body:
; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; STRIDED-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP3]], i64 0
-; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
+; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP2]], i64 0
+; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; STRIDED-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; STRIDED-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 2 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]]
+; STRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, <vscale x 2 x i1> [[TMP4]], i32 [[TMP10]])
; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]]
; STRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
-; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP3]], [[EVL_BASED_IV]]
-; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
-; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; STRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP2]] to i64
+; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; STRIDED: middle.block:
@@ -1417,23 +1469,20 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-UF2: vector.ph:
; STRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP3]], i64 0
-; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; STRIDED-UF2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; STRIDED-UF2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; STRIDED-UF2-NEXT: [[TMP6:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP5]], splat (i64 1)
-; STRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP6]]
; STRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]]
; STRIDED-UF2: vector.body:
; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; STRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], <vscale x 2 x i64> [[STEP_ADD]]
-; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
-; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
+; STRIDED-UF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[INDEX]]
+; STRIDED-UF2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-UF2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1
+; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 [[TMP7]]
+; STRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32
+; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP15]])
+; STRIDED-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
+; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP16]])
; STRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]]
; STRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
@@ -1441,7 +1490,6 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-UF2-NEXT: store <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8
; STRIDED-UF2-NEXT: store <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8
; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; STRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; STRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; STRIDED-UF2: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
index ba7005f4f56dc..3be12edbc4a49 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
@@ -10,35 +10,77 @@
define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
; IF-EVL-LABEL: @gather_scatter(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ]
-; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
-; IF-EVL-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N:%.*]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY1]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP2]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; IF-EVL-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 2 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP12]] to i32
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> [[TMP4]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
+; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> align 4 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP2]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.end:
; IF-EVL-NEXT: ret void
;
; NO-VP-LABEL: @gather_scatter(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP4]], 1
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP9]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
-; NO-VP: for.body:
-; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[ARRAYIDX3]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP6]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x float> poison)
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
+; NO-VP-NEXT: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true))
+; NO-VP-NEXT: [[INDVARS_IV_NEXT1]] = add nuw i64 [[INDVARS_IV1]], [[TMP3]]
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX4]], align 8
+; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]]
; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]]
; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
-; NO-VP-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1
-; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N:%.*]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY1]]
+; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: for.end:
; NO-VP-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index b5662b0bd8d3b..e6f3ee4e1b2f3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -118,29 +118,33 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP1:%.*]] = mul <vscale x 4 x i64> [[TMP2]], splat (i64 1)
-; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP1]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0
+; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP19]])
; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_MASKED_GATHER1]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 3
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 3
+; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP20]])
; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[WIDE_MASKED_GATHER2]]
; IF-EVL-NEXT: [[TMP12]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP4]])
-; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP4]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL: middle.block:
@@ -162,26 +166,22 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
-; NO-VP-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-VP-NEXT: [[TMP7:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP6]], splat (i64 1)
-; NO-VP-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP7]]
-; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP3]], i64 0
-; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[INDEX]], i32 0
+; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER1]]
-; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 3
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[INDEX]], i32 3
+; NO-VP-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; NO-VP-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; NO-VP: middle.block:
@@ -358,29 +358,33 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP1:%.*]] = mul <vscale x 4 x i64> [[TMP2]], splat (i64 1)
-; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP1]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0
+; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP19]])
; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_MASKED_GATHER1]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 2
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 2
+; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP20]])
; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[WIDE_MASKED_GATHER2]]
; IF-EVL-NEXT: [[TMP12]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP4]])
-; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP4]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
@@ -402,26 +406,22 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
-; NO-VP-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-VP-NEXT: [[TMP7:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP6]], splat (i64 1)
-; NO-VP-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP7]]
-; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP3]], i64 0
-; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[INDEX]], i32 0
+; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER1]]
-; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 2
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[INDEX]], i32 2
+; NO-VP-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP3]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; NO-VP-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; NO-VP: middle.block:
@@ -591,35 +591,38 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]]
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[N]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP3:%.*]] = mul nsw <vscale x 4 x i64> [[TMP4]], splat (i64 -1)
-; IF-EVL-NEXT: [[INDUCTION:%.*]] = add nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP1]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT: [[TMP8:%.*]] = mul nsw i64 -1, [[TMP7]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[TMP5]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
+; IF-EVL-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP8]])
; IF-EVL-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP3]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER3]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 2
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2
+; IF-EVL-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP3]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP24]])
; IF-EVL-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER4]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 3
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3
+; IF-EVL-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP3]] to i32
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP25]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP26]])
; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP13]], [[WIDE_MASKED_GATHER5]]
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP6]])
-; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP27]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: middle.block:
@@ -643,32 +646,27 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-VP-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[N]], i64 0
-; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-VP-NEXT: [[TMP8:%.*]] = mul nsw <vscale x 4 x i64> [[TMP7]], splat (i64 -1)
-; NO-VP-NEXT: [[INDUCTION:%.*]] = add nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP8]]
-; NO-VP-NEXT: [[TMP9:%.*]] = mul nsw i64 -1, [[TMP5]]
-; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
-; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
+; NO-VP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; NO-VP-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER3]]
-; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 2
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2
+; NO-VP-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP23]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
; NO-VP-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[TMP12]], [[WIDE_MASKED_GATHER4]]
-; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 3
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3
+; NO-VP-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP5]] to i32
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
; NO-VP-NEXT: [[TMP16]] = add <vscale x 4 x i32> [[TMP14]], [[WIDE_MASKED_GATHER5]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; NO-VP: middle.block:
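The FileCheck churn above all follows one pattern: a widened gather over a vector of per-lane addresses is replaced by a single vp.strided.load from the scalar base address with a constant byte stride (16 bytes per [4 x i32] row in these tests). A rough before/after sketch of that shape, with illustrative value names rather than names taken from any particular test:

  ; before: gather over a vector of lane addresses
  %addrs = getelementptr inbounds [4 x i32], ptr %a, <vscale x 4 x i64> %vec.ind, i32 0
  %v = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %addrs, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)

  ; after: one strided load from the scalar base; the stride is in bytes
  %base = getelementptr inbounds [4 x i32], ptr %a, i64 %index, i32 0
  %vl = trunc i64 %vlmax to i32    ; full vector length, vscale x 4 elements
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %base, i64 16, <vscale x 4 x i1> splat (i1 true), i32 %vl)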
From 2fa6c4bd35e3b34072e62a3242d802fe91ea347a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 22 Aug 2025 01:28:33 -0700
Subject: [PATCH 04/25] Support EVL
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 8 +++
.../RISCV/blocks-with-dead-instructions.ll | 9 +--
.../RISCV/masked_gather_scatter.ll | 64 +++++++------------
.../LoopVectorize/RISCV/pr154103.ll | 9 +--
.../LoopVectorize/RISCV/strided-accesses.ll | 53 +++------------
.../RISCV/tail-folding-gather-scatter.ll | 9 +--
.../RISCV/tail-folding-interleave.ll | 48 +++-----------
7 files changed, 53 insertions(+), 147 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1c84c143083a9..42dc9677845b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2797,6 +2797,14 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
+ if (auto *StridedL = dyn_cast<VPWidenStridedLoadRecipe>(&CurRecipe))
+ if (StridedL->isMasked() &&
+ match(StridedL->getMask(), m_RemoveMask(HeaderMask, Mask)))
+ return new VPWidenStridedLoadRecipe(
+ *cast<LoadInst>(&StridedL->getIngredient()), StridedL->getAddr(),
+ StridedL->getStride(), &EVL, Mask, *StridedL,
+ StridedL->getDebugLoc());
+
if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
!cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 0d392021460d0..a1619f71a438b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -315,8 +315,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[IC]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
@@ -329,19 +327,14 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP27]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP27]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 3, [[TMP12]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <vscale x 8 x i32> [[TMP18]], [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP4]] to i32
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vp.strided.load.nxv8i16.p0.i64(ptr align 2 [[TMP21]], i64 6, <vscale x 8 x i1> [[TMP19]], i32 [[TMP15]])
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vp.strided.load.nxv8i16.p0.i64(ptr align 2 [[TMP21]], i64 6, <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i1> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[TMP17]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index dfa46bf054fc4..e4da9d66fb41e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -45,44 +45,35 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT8]]
; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; RV32: vector.ph:
-; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
-; RV32-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; RV32-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP9]], splat (i64 16)
-; RV32-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP10]]
+; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; RV32-NEXT: [[TMP8:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 16)
+; RV32-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP8]]
; RV32-NEXT: br label [[VECTOR_BODY:%.*]]
; RV32: vector.body:
; RV32-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV32-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; RV32-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
-; RV32-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT9]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; RV32-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
-; RV32-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 16, [[TMP12]]
-; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
+; RV32-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; RV32-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; RV32-NEXT: [[TMP11:%.*]] = mul nuw nsw i64 16, [[TMP10]]
+; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
; RV32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; RV32-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; RV32-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP14]], [[BROADCAST_SPLAT10]]
; RV32-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP8]] to i32
-; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP16]], i64 64, <vscale x 2 x i1> [[TMP15]], i32 [[TMP17]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; RV32-NEXT: [[TMP18:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 100)
-; RV32-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP18]], <vscale x 2 x i1> zeroinitializer
-; RV32-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP20]]
-; RV32-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP8]] to i32
-; RV32-NEXT: [[WIDE_STRIDED_LOAD11:%.*]] = call <vscale x 2 x double> @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP21]], i64 256, <vscale x 2 x i1> [[TMP19]], i32 [[TMP22]]), !alias.scope [[META5:![0-9]+]]
-; RV32-NEXT: [[TMP23:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_STRIDED_LOAD]] to <vscale x 2 x double>
-; RV32-NEXT: [[TMP24:%.*]] = fadd <vscale x 2 x double> [[WIDE_STRIDED_LOAD11]], [[TMP23]]
-; RV32-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP24]], <vscale x 2 x ptr> align 8 [[TMP25]], <vscale x 2 x i1> [[TMP18]], i32 [[TMP11]]), !alias.scope [[META3]], !noalias [[META5]]
-; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
-; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
+; RV32-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP12]], i64 64, <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; RV32-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 100)
+; RV32-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1
+; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP14]]
+; RV32-NEXT: [[WIDE_STRIDED_LOAD9:%.*]] = call <vscale x 2 x double> @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP15]], i64 256, <vscale x 2 x i1> [[TMP13]], i32 [[TMP9]]), !alias.scope [[META5:![0-9]+]]
+; RV32-NEXT: [[TMP16:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_STRIDED_LOAD]] to <vscale x 2 x double>
+; RV32-NEXT: [[TMP17:%.*]] = fadd <vscale x 2 x double> [[WIDE_STRIDED_LOAD9]], [[TMP16]]
+; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
+; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP17]], <vscale x 2 x ptr> align 8 [[TMP18]], <vscale x 2 x i1> [[TMP13]], i32 [[TMP9]]), !alias.scope [[META3]], !noalias [[META5]]
+; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[EVL_BASED_IV]]
+; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; RV32-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; RV32-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; RV32-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; RV32: middle.block:
; RV32-NEXT: br label [[FOR_END:%.*]]
; RV32: scalar.ph:
@@ -126,8 +117,6 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; RV64: vector.ph:
-; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; RV64-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; RV64-NEXT: [[TMP3:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 16)
; RV64-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP3]]
@@ -137,24 +126,17 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV64-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV64-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; RV64-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP10]], i64 0
-; RV64-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; RV64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
; RV64-NEXT: [[TMP11:%.*]] = mul nuw nsw i64 16, [[TMP8]]
; RV64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; RV64-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; RV64-NEXT: [[TMP25:%.*]] = icmp ult <vscale x 2 x i32> [[TMP16]], [[BROADCAST_SPLAT7]]
; RV64-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16
; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
-; RV64-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP1]] to i32
-; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, <vscale x 2 x i1> [[TMP25]], i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]]
+; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
; RV64-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
-; RV64-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[TMP25]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1
; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]]
-; RV64-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP1]] to i32
-; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, <vscale x 2 x i1> [[TMP12]], i32 [[TMP15]]), !alias.scope [[META3:![0-9]+]]
+; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]]
; RV64-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
; RV64-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
index 7503165665642..47aefc736fd44 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
@@ -9,8 +9,6 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[C]], i64 0
@@ -20,15 +18,10 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[EVL_BASED_IV]], 7
; CHECK-NEXT: [[IV:%.*]] = add i64 1, [[TMP5]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr align 1 [[GEP]], i64 7, <vscale x 4 x i1> [[TMP4]], i32 [[TMP7]])
+; CHECK-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr align 1 [[GEP]], i64 7, <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_STRIDED_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> [[TMP8]], <vscale x 4 x i64> splat (i64 1), i32 [[TMP2]])
; CHECK-NEXT: [[TMP10:%.*]] = sdiv <vscale x 4 x i64> zeroinitializer, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 2b810e8f476b4..c30b13de57806 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -10,8 +10,6 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -21,19 +19,14 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], 8
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP14]]
-; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, <vscale x 4 x i1> [[TMP7]], i32 [[TMP19]])
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
@@ -133,8 +126,6 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP6]], splat (i64 64)
; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP3]]
@@ -144,19 +135,14 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 64, [[TMP11]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 4 x i32> [[TMP15]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 64
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 256, <vscale x 4 x i1> [[TMP16]], i32 [[TMP18]])
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 256, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
@@ -813,8 +799,6 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; STRIDED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
-; STRIDED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT: [[TMP45:%.*]] = mul nuw i64 [[TMP42]], 4
; STRIDED-NEXT: [[TMP47:%.*]] = mul i64 [[STRIDE]], 4
; STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
; STRIDED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -827,21 +811,16 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[TMP43:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; STRIDED-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP43]], i64 0
-; STRIDED-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-NEXT: [[TMP44:%.*]] = zext i32 [[TMP43]] to i64
; STRIDED-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP44]], i64 0
; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; STRIDED-NEXT: [[TMP48:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; STRIDED-NEXT: [[TMP49:%.*]] = icmp ult <vscale x 4 x i32> [[TMP48]], [[BROADCAST_SPLAT12]]
; STRIDED-NEXT: [[TMP50:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], [[STRIDE]]
; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]]
-; STRIDED-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP50]]
-; STRIDED-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP45]] to i32
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP46]], i64 [[TMP47]], <vscale x 4 x i1> [[TMP49]], i32 [[TMP51]]), !alias.scope [[META5:![0-9]+]]
-; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
-; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; STRIDED-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP50]]
+; STRIDED-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP42]], i64 [[TMP47]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]]
+; STRIDED-NEXT: [[TMP45:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; STRIDED-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP45]], <vscale x 4 x ptr> align 4 [[TMP46]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP44]], [[EVL_BASED_IV]]
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
@@ -1354,20 +1333,13 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-NEXT: entry:
; NOSTRIDED-NEXT: br label [[VECTOR_PH:%.*]]
; NOSTRIDED: vector.ph:
-; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP2]], i64 0
-; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; NOSTRIDED-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NOSTRIDED-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 2 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]]
-; NOSTRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32
-; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, <vscale x 2 x i1> [[TMP4]], i32 [[TMP10]])
+; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
; NOSTRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]]
; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
; NOSTRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP2]] to i64
@@ -1434,20 +1406,13 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-NEXT: entry:
; STRIDED-NEXT: br label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
-; STRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; STRIDED: vector.body:
; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; STRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP2]], i64 0
-; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; STRIDED-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; STRIDED-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 2 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]]
-; STRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, <vscale x 2 x i1> [[TMP4]], i32 [[TMP10]])
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]]
; STRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
; STRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP2]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
index 3be12edbc4a49..7b184f6e02976 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
@@ -12,20 +12,13 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY1]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY1]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP2]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; IF-EVL-NEXT: [[TMP4:%.*]] = icmp ult <vscale x 2 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP12]] to i32
-; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> [[TMP4]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index e6f3ee4e1b2f3..16b367c672cd8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -118,28 +118,19 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0
-; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP19]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_MASKED_GATHER1]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 3
-; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP20]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[WIDE_MASKED_GATHER2]]
; IF-EVL-NEXT: [[TMP12]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP4]] to i64
@@ -358,28 +349,19 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0
-; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP19]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_MASKED_GATHER1]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 2
-; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, <vscale x 4 x i1> [[TMP7]], i32 [[TMP20]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[WIDE_MASKED_GATHER2]]
; IF-EVL-NEXT: [[TMP12]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP4]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP4]] to i64
@@ -591,33 +573,23 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]]
; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP1]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[TMP5]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; IF-EVL-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP8]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP3]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP14]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER3]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2
-; IF-EVL-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP3]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP24]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER4]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3
-; IF-EVL-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP3]] to i32
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP25]], i64 -16, <vscale x 4 x i1> [[TMP9]], i32 [[TMP26]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP25]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP13]], [[WIDE_MASKED_GATHER5]]
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
>From be1b853c9fe18ee1e1e877411ab947f670ff4b4a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 4 Sep 2025 05:42:28 -0700
Subject: [PATCH 05/25] Handle strided loads in planContainsAdditionalSimplifications
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ec8eb3c13489..f46e07563dc7c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7091,6 +7091,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}
+
+      // Strided loads are created from gathers by convertToStridedAccesses,
+      // and their cost will be lower than that of the original gather.
+ if (isa<VPWidenStridedLoadRecipe>(&R))
+ return true;
+
if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
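For context, a minimal hand-written IR sketch of the rewrite referred to in the comment above; the value names and the 16-byte stride are illustrative and not taken from the patch. convertToStridedAccesses replaces a widened gather whose addresses advance by a loop-invariant stride with a single vp.strided.load from the scalar base pointer:

  ; before (illustrative): gather over a vector of addresses that step by 16 bytes per lane
  %wide.gather = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %addrs, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
  ; after (illustrative): one strided load from the scalar base; the stride is in bytes, the EVL is i32
  %wide.load = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %base, i64 16, <vscale x 4 x i1> splat (i1 true), i32 %evl)

The legacy cost model still prices such an access as a gather, so the VPlan-based cost can legitimately be lower, which is why the recipe is reported as an additional simplification.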
>From 7f625be82ef8698807e7ed12ffc5e61588c5ee8c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 21 Oct 2025 23:23:05 -0700
Subject: [PATCH 06/25] Update comment. nfc
---
llvm/lib/Transforms/Vectorize/VPlan.h | 12 +++++++-----
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 ++++++--
2 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e171ed948d24a..4d739749cfc22 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1895,6 +1895,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
return VarIdx;
}
+ /// Returns the element type for the first \p I indices of this recipe.
Type *getIndexedType(unsigned I) const {
auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
SmallVector<Value *, 4> Ops(GEP->idx_begin(), GEP->idx_begin() + I);
@@ -1986,8 +1987,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
-/// A recipe to compute the pointers for widened memory accesses of IndexedTy,
-/// with the Stride expressed in units of IndexedTy.
+/// A recipe to compute the pointers for widened memory accesses of
+/// SourceElementTy, with the Stride expressed in units of SourceElementTy.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *SourceElementTy;
@@ -3439,9 +3440,9 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
#endif
};
-/// A recipe for strided load operations, using the base address, stride, and an
-/// optional mask. This recipe will generate an vp.strided.load intrinsic call
-/// to represent memory accesses with a fixed stride.
+/// A recipe for strided load operations, using the base address, stride, VF,
+/// and an optional mask. This recipe will generate a vp.strided.load intrinsic
+/// call to represent memory accesses with a fixed stride.
struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
@@ -3481,6 +3482,7 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
+ // All operands except the mask are only used for the first lane.
return Op == getAddr() || Op == getStride() || Op == getVF();
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 42dc9677845b7..df5d539c808cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5138,8 +5138,8 @@ static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
if (Opcode != Instruction::Shl && Opcode != Instruction::Mul)
return {nullptr, nullptr};
- // Match the pattern binop(variant, invariant), or binop(invariant, variant)
- // if the binary operator is commutative.
+ // Match the pattern binop(variant, uniform), or binop(uniform, variant) if
+ // the binary operator is commutative.
bool IsLHSUniform = vputils::isSingleScalar(WidenR->getOperand(0));
if (IsLHSUniform == vputils::isSingleScalar(WidenR->getOperand(1)) ||
(IsLHSUniform && !Instruction::isCommutative(Opcode)))
@@ -5163,6 +5163,10 @@ static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
return {StartR, StrideR};
}
+/// Checks if the given VPWidenGEPRecipe \p WidenGEP represents a strided
+/// access. If so, it creates recipes representing the base pointer and stride
+/// in element type, and returns a tuple of {base pointer, stride, element
+/// type}. Otherwise, returns a tuple where all elements are nullptr.
static std::tuple<VPValue *, VPValue *, Type *>
determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) {
// TODO: Check if the base pointer is strided.
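To make the {base pointer, stride, element type} tuple concrete, a hypothetical example that is not part of the patch: for a GEP whose index is the induction variable scaled by a loop-invariant constant, e.g. in scalar form

  ; illustrative scalar form of the strided access pattern (names are hypothetical)
  %offset = mul nuw nsw i64 %iv, 8
  %gep = getelementptr i32, ptr %p, i64 %offset

determineBaseAndStride would return a base pointer derived from %p and the index's start value, an element stride of 8, and element type i32. The stride is then scaled by the allocation size of i32, giving the 32-byte (i64 32) strides visible in the single_constant_stride_int_scaled checks further down.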
>From 9b2f017f0134459de281cb2ebd4ffe2550080a2a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 22 Oct 2025 02:15:56 -0700
Subject: [PATCH 07/25] Find base pointer and stride after profitable check
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 50 +++++++++----------
1 file changed, 23 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index df5d539c808cc..54cca902dc893 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5206,8 +5206,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
VPTypeAnalysis TypeInfo(Plan);
DenseMap<VPWidenGEPRecipe *, std::tuple<VPValue *, VPValue *, Type *>>
StrideCache;
- SmallVector<VPRecipeBase *> ToErase;
- SmallPtrSet<VPValue *, 4> PossiblyDead;
+ SmallVector<VPWidenMemoryRecipe *> ToErase;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
@@ -5230,23 +5229,6 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
if (!PtrUV)
continue;
- // Try to get base and stride here.
- VPValue *BasePtr, *StrideInElement;
- Type *ElementTy;
- auto It = StrideCache.find(Ptr);
- if (It != StrideCache.end())
- std::tie(BasePtr, StrideInElement, ElementTy) = It->second;
- else
- std::tie(BasePtr, StrideInElement, ElementTy) = StrideCache[Ptr] =
- determineBaseAndStride(Ptr);
-
- // Skip if the memory access is not a strided access.
- if (!BasePtr) {
- assert(!StrideInElement && !ElementTy);
- continue;
- }
- assert(StrideInElement && ElementTy);
-
Instruction &Ingredient = MemR->getIngredient();
auto IsProfitable = [&](ElementCount VF) -> bool {
Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
@@ -5264,12 +5246,25 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
};
if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
- Range)) {
- PossiblyDead.insert(BasePtr);
- PossiblyDead.insert(StrideInElement);
+ Range))
+ continue;
+
+ // Try to get base and stride here.
+ VPValue *BasePtr, *StrideInElement;
+ Type *ElementTy;
+ auto It = StrideCache.find(Ptr);
+ if (It != StrideCache.end())
+ std::tie(BasePtr, StrideInElement, ElementTy) = It->second;
+ else
+ std::tie(BasePtr, StrideInElement, ElementTy) = StrideCache[Ptr] =
+ determineBaseAndStride(Ptr);
+
+ // Skip if the memory access is not a strided access.
+ if (!BasePtr) {
+ assert(!StrideInElement && !ElementTy);
continue;
}
- PossiblyDead.insert(Ptr);
+ assert(StrideInElement && ElementTy);
// Create a new vector pointer for strided access.
auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
@@ -5304,9 +5299,10 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
}
}
- // Clean up dead memory access recipes, and unused base address and stride.
- for (auto *R : ToErase)
+ // Clean up dead recipes.
+ for (auto *R : ToErase) {
+ VPValue *Addr = R->getAddr();
R->eraseFromParent();
- for (auto *V : PossiblyDead)
- recursivelyDeleteDeadRecipes(V);
+ recursivelyDeleteDeadRecipes(Addr);
+ }
}
>From 1ac6f6d4b6c87d5af44e965f059db8039cb85fb0 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 22 Oct 2025 02:43:30 -0700
Subject: [PATCH 08/25] Replace getUnderlyingValue() with getUnderlyingInstr()
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 54cca902dc893..fb8a00e775a66 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5130,7 +5130,7 @@ static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
return {WidenIV, WidenIV->getStepValue()};
auto *WidenR = dyn_cast<VPWidenRecipe>(CurIndex);
- if (!WidenR || !CurIndex->getUnderlyingValue())
+ if (!WidenR || !WidenR->getUnderlyingInstr())
return {nullptr, nullptr};
unsigned Opcode = WidenR->getOpcode();
>From ff075e9a845c950d5aa7a126c55bbd7cc2d6548e Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 22 Oct 2025 03:11:24 -0700
Subject: [PATCH 09/25] Directly use flags from WidenGEP
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fb8a00e775a66..3304b574ff5e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5267,10 +5267,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
assert(StrideInElement && ElementTy);
// Create a new vector pointer for strided access.
- auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
auto *NewPtr = new VPVectorPointerRecipe(
- BasePtr, ElementTy, StrideInElement,
- GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+ BasePtr, ElementTy, StrideInElement, Ptr->getGEPNoWrapFlags(),
Ptr->getDebugLoc());
NewPtr->insertBefore(MemR);
>From 2f58bb845edb1e2f24720532f00f297038ce5897 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 22 Oct 2025 03:29:02 -0700
Subject: [PATCH 10/25] Pass Ptr->getUnderlyingValue() in memory cost model
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3304b574ff5e2..7cae4183f2154 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5223,12 +5223,6 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
if (!Ptr)
continue;
- // Memory cost model requires the pointer operand of memory access
- // instruction.
- Value *PtrUV = Ptr->getUnderlyingValue();
- if (!PtrUV)
- continue;
-
Instruction &Ingredient = MemR->getIngredient();
auto IsProfitable = [&](ElementCount VF) -> bool {
Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
@@ -5239,8 +5233,9 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
const InstructionCost StridedLoadStoreCost =
Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(
- Intrinsic::experimental_vp_strided_load, DataTy, PtrUV,
- MemR->isMasked(), Alignment, &Ingredient),
+ Intrinsic::experimental_vp_strided_load, DataTy,
+ Ptr->getUnderlyingValue(), MemR->isMasked(), Alignment,
+ &Ingredient),
Ctx.CostKind);
return StridedLoadStoreCost < CurrentCost;
};
>From 77cc4c4acd8bc91fa0e2134b1cdb2b66398cdb20 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 22 Oct 2025 06:30:47 -0700
Subject: [PATCH 11/25] Implement VPWidenStridedLoadRecipe::computeCost
---
llvm/lib/Transforms/Vectorize/VPlan.h | 4 +++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 26 +++++++++++--------
2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4d739749cfc22..e8ef8a9da5584 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3472,6 +3472,10 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
/// Generate a strided load.
void execute(VPTransformState &State) override;
+ /// Return the cost of this VPWidenStridedLoadRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index de9d627e15ef1..897aa7fe243f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3484,11 +3484,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
- unsigned Opcode =
- isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
- this)
- ? Instruction::Load
- : Instruction::Store;
+ unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
+ ? Instruction::Load
+ : Instruction::Store;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -3498,12 +3496,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
"Inconsecutive memory access should not have the order.");
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- if (isa<VPWidenStridedLoadRecipe>(this))
- return Ctx.TTI.getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
- Ty, Ptr, IsMasked, Alignment, &Ingredient),
- Ctx.CostKind);
-
Type *PtrTy = Ptr->getType();
// If the address value is uniform across all lanes, then the address can be
// calculated with scalar type and broadcast.
@@ -3693,6 +3685,18 @@ void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
State.set(this, NewLI);
}
+InstructionCost
+VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ return Ctx.TTI.getMemIntrinsicInstrCost(
+ MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load, Ty,
+ Ptr, IsMasked, Alignment, &Ingredient),
+ Ctx.CostKind);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStridedLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
>From 75f518c54f90ad0efd58c77c612715652773a1e9 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Nov 2025 00:48:22 -0800
Subject: [PATCH 12/25] update comments. nfc
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7cae4183f2154..d1bc32c3d2aae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5165,7 +5165,7 @@ static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
/// Checks if the given VPWidenGEPRecipe \p WidenGEP represents a strided
/// access. If so, it creates recipes representing the base pointer and stride
-/// in element type, and returns a tuple of {base pointer, stride, element
+/// in element units, and returns a tuple of {base pointer, stride, element
/// type}. Otherwise, returns a tuple where all elements are nullptr.
static std::tuple<VPValue *, VPValue *, Type *>
determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) {
>From cec7a70346bce8a71bfa4acaf490790e4142b362 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Nov 2025 01:11:31 -0800
Subject: [PATCH 13/25] cast to VPWidenLoadRecipe directly for now. nfc
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d1bc32c3d2aae..863955abad75d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5210,31 +5210,31 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+ auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
// TODO: Support strided store.
// TODO: Transform reverse access into strided access with -1 stride.
// TODO: Transform gather/scatter with uniform address into strided access
// with 0 stride.
// TODO: Transform interleave access into multiple strided accesses.
- if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || MemR->isConsecutive())
+ if (!LoadR || LoadR->isConsecutive())
continue;
- auto *Ptr = dyn_cast<VPWidenGEPRecipe>(MemR->getAddr());
+ auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
if (!Ptr)
continue;
- Instruction &Ingredient = MemR->getIngredient();
+ Instruction &Ingredient = LoadR->getIngredient();
auto IsProfitable = [&](ElementCount VF) -> bool {
Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
return false;
- const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+ const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
const InstructionCost StridedLoadStoreCost =
Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(
Intrinsic::experimental_vp_strided_load, DataTy,
- Ptr->getUnderlyingValue(), MemR->isMasked(), Alignment,
+ Ptr->getUnderlyingValue(), LoadR->isMasked(), Alignment,
&Ingredient),
Ctx.CostKind);
return StridedLoadStoreCost < CurrentCost;
@@ -5265,7 +5265,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
auto *NewPtr = new VPVectorPointerRecipe(
BasePtr, ElementTy, StrideInElement, Ptr->getGEPNoWrapFlags(),
Ptr->getDebugLoc());
- NewPtr->insertBefore(MemR);
+ NewPtr->insertBefore(LoadR);
const DataLayout &DL = Ingredient.getDataLayout();
TypeSize TS = DL.getTypeAllocSize(ElementTy);
@@ -5277,11 +5277,10 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
TypeInfo.inferScalarType(StrideInElement), TypeScale));
auto *ScaledStride =
new VPInstruction(Instruction::Mul, {StrideInElement, ScaleVPV});
- ScaledStride->insertBefore(MemR);
+ ScaledStride->insertBefore(LoadR);
StrideInBytes = ScaledStride;
}
- auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
auto *StridedLoad = new VPWidenStridedLoadRecipe(
*cast<LoadInst>(&Ingredient), NewPtr, StrideInBytes, &Plan.getVF(),
LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
>From a9ea14a93092378854d7cdd5a8b9a27258c2df79 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Nov 2025 01:14:10 -0800
Subject: [PATCH 14/25] replace getOrAddLiveIn(ConstantInt::get()) with
getConstantInt. nfc
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f46e07563dc7c..197bfe05ee29d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7673,7 +7673,7 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
} else {
const DataLayout &DL = I->getDataLayout();
auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
- VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+ VPValue *StrideOne = Plan.getConstantInt(StrideTy, 1);
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 863955abad75d..a406503678437 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5273,8 +5273,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
VPValue *StrideInBytes = StrideInElement;
// Scale the stride by the size of the indexed type.
if (TypeScale != 1) {
- VPValue *ScaleVPV = Plan.getOrAddLiveIn(ConstantInt::get(
- TypeInfo.inferScalarType(StrideInElement), TypeScale));
+ VPValue *ScaleVPV = Plan.getConstantInt(
+ TypeInfo.inferScalarType(StrideInElement), TypeScale);
auto *ScaledStride =
new VPInstruction(Instruction::Mul, {StrideInElement, ScaleVPV});
ScaledStride->insertBefore(LoadR);
>From 724ea9541e95cf122bf372ac7ce8881b9b4f2873 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Nov 2025 01:28:12 -0800
Subject: [PATCH 15/25] updated assertion message. nfc
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a406503678437..02853f77db258 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5255,11 +5255,10 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
determineBaseAndStride(Ptr);
// Skip if the memory access is not a strided access.
- if (!BasePtr) {
- assert(!StrideInElement && !ElementTy);
+ if (!BasePtr)
continue;
- }
- assert(StrideInElement && ElementTy);
+ assert(StrideInElement && ElementTy &&
+ "Can not get stride information for a strided access");
// Create a new vector pointer for strided access.
auto *NewPtr = new VPVectorPointerRecipe(
>From ea95c95737fffbeac52a44c1e3be3a2dd9dca1e1 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 10 Nov 2025 23:23:08 -0800
Subject: [PATCH 16/25] rebase and remove getLoadStoreAlignment call
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 897aa7fe243f7..63f08c61dc375 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3661,7 +3661,6 @@ void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
@@ -3690,7 +3689,6 @@ VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load, Ty,
Ptr, IsMasked, Alignment, &Ingredient),
>From d511257cc24c0028bc9bbcb3dac7962be2e31b53 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 2 Dec 2025 07:38:14 -0800
Subject: [PATCH 17/25] Introduce VPWidenMemIntrinsicRecipe
---
llvm/lib/Analysis/VectorUtils.cpp | 4 +
.../Transforms/Vectorize/LoopVectorize.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlan.h | 95 +++++++++++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 125 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 47 +++++--
llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 7 +-
.../LoopVectorize/RISCV/strided-accesses.ll | 25 ++--
.../RISCV/tail-folding-gather-scatter.ll | 2 +-
.../RISCV/tail-folding-interleave.ll | 27 ++--
11 files changed, 292 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index a3e9b039f9225..075dd82983181 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -175,6 +175,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+ case Intrinsic::experimental_vp_strided_load:
+ return ScalarOpdIdx == 0 || ScalarOpdIdx == 1;
default:
return false;
}
@@ -212,6 +214,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::powi:
case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
+ case Intrinsic::experimental_vp_strided_load:
+ return OpdIdx == -1 || OpdIdx == 0 || OpdIdx == 1;
default:
return OpdIdx == -1;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 197bfe05ee29d..2d8588e8d062a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4113,6 +4113,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenCastSC:
case VPDef::VPWidenGEPSC:
case VPDef::VPWidenIntrinsicSC:
+ case VPDef::VPWidenMemIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
case VPDef::VPBlendSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e8ef8a9da5584..c8cdae13e58b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -572,6 +572,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenMemIntrinsicSC:
case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
@@ -1698,6 +1699,98 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
#endif
};
+/// A recipe for widening vector memory intrinsics.
+class VPWidenMemIntrinsicRecipe : public VPRecipeBase, public VPIRMetadata {
+ Instruction &Ingredient;
+
+ /// Alignment information for this memory access.
+ Align Alignment;
+
+ /// ID of the vector intrinsic to widen.
+ Intrinsic::ID VectorIntrinsicID;
+
+ /// Scalar return type of the intrinsic.
+ Type *ResultTy;
+
+ /// True if the intrinsic may read from memory.
+ bool MayReadFromMemory;
+
+  /// True if the intrinsic may write to memory.
+ bool MayWriteToMemory;
+
+ /// True if the intrinsic may have side-effects.
+ bool MayHaveSideEffects;
+
+public:
+ // TODO: support StoreInst for strided store
+ VPWidenMemIntrinsicRecipe(LoadInst &LI, Intrinsic::ID VectorIntrinsicID,
+ ArrayRef<VPValue *> CallArguments,
+ const VPIRMetadata &MD = {},
+ DebugLoc DL = DebugLoc::getUnknown())
+ : VPRecipeBase(VPDef::VPWidenMemIntrinsicSC, CallArguments, DL),
+ VPIRMetadata(MD), Ingredient(LI), Alignment(LI.getAlign()),
+ VectorIntrinsicID(VectorIntrinsicID), ResultTy(LI.getType()),
+ MayReadFromMemory(LI.mayReadFromMemory()),
+ MayWriteToMemory(LI.mayWriteToMemory()),
+ MayHaveSideEffects(LI.mayHaveSideEffects()) {
+ new VPValue(&LI, this);
+ }
+
+ ~VPWidenMemIntrinsicRecipe() override = default;
+
+ VPWidenMemIntrinsicRecipe *clone() override {
+ return new VPWidenMemIntrinsicRecipe(*cast<LoadInst>(&Ingredient),
+ VectorIntrinsicID, operands(), *this,
+ getDebugLoc());
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenMemIntrinsicSC)
+
+ /// Produce a widened version of the vector memory intrinsic.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this vector memory intrinsic.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+ /// Return the ID of the intrinsic.
+ Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; }
+
+ /// Return the scalar return type of the intrinsic.
+ Type *getResultType() const { return ResultTy; }
+
+  /// Return the name of the intrinsic as a string.
+ StringRef getIntrinsicName() const {
+ return Intrinsic::getBaseName(VectorIntrinsicID);
+ }
+
+ /// Returns true if the intrinsic may read from memory.
+ bool mayReadFromMemory() const { return MayReadFromMemory; }
+
+ /// Returns true if the intrinsic may write to memory.
+ bool mayWriteToMemory() const { return MayWriteToMemory; }
+
+ /// Returns true if the intrinsic may have side-effects.
+ bool mayHaveSideEffects() const { return MayHaveSideEffects; }
+
+ unsigned getMemoryPointerParamPos() const;
+
+ unsigned getMaskParamPos() const;
+
+ void setMask(VPValue *Mask) { setOperand(getMaskParamPos(), Mask); }
+
+ VPValue *getMask() const { return getOperand(getMaskParamPos()); }
+
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
+
+protected:
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void printRecipe(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for widening Call instructions using library calls.
class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
public VPIRMetadata {
@@ -3982,6 +4075,8 @@ static inline auto castToVPIRMetadata(RecipeBasePtrTy R) -> DstTy {
return cast<VPWidenCastRecipe>(R);
case VPDef::VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(R);
+ case VPDef::VPWidenMemIntrinsicSC:
+ return cast<VPWidenMemIntrinsicRecipe>(R);
case VPDef::VPWidenCallSC:
return cast<VPWidenCallRecipe>(R);
case VPDef::VPWidenSelectSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f44b6ccf8c35a..2e149cdec7e16 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -300,7 +300,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
- VPWidenCastRecipe>(
+ VPWidenMemIntrinsicRecipe, VPWidenCastRecipe>(
[](const auto *R) { return R->getResultType(); })
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 63f08c61dc375..29ee35f6c4441 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -72,6 +72,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
->onlyReadsMemory();
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
+ case VPWidenMemIntrinsicSC:
+ return cast<VPWidenMemIntrinsicRecipe>(this)->mayWriteToMemory();
case VPCanonicalIVPHISC:
case VPBranchOnMaskSC:
case VPDerivedIVSC:
@@ -126,6 +128,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
->onlyWritesMemory();
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
+ case VPWidenMemIntrinsicSC:
+ return cast<VPWidenMemIntrinsicRecipe>(this)->mayReadFromMemory();
case VPBranchOnMaskSC:
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
@@ -180,6 +184,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
}
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
+ case VPWidenMemIntrinsicSC:
+ return cast<VPWidenMemIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
@@ -1822,6 +1828,125 @@ void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
+unsigned VPWidenMemIntrinsicRecipe::getMemoryPointerParamPos() const {
+ if (auto Pos = VPIntrinsic::getMemoryPointerParamPos(VectorIntrinsicID))
+ return *Pos;
+
+ switch (VectorIntrinsicID) {
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_gather:
+ case Intrinsic::masked_expandload:
+ return 0;
+ case Intrinsic::masked_store:
+ case Intrinsic::masked_scatter:
+ case Intrinsic::masked_compressstore:
+ return 1;
+ default:
+ llvm_unreachable("unknown vector memory intrinsic");
+ }
+}
+
+unsigned VPWidenMemIntrinsicRecipe::getMaskParamPos() const {
+ if (auto Pos = VPIntrinsic::getMaskParamPos(VectorIntrinsicID))
+ return *Pos;
+
+ switch (VectorIntrinsicID) {
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_gather:
+ case Intrinsic::masked_expandload:
+ return 1;
+ case Intrinsic::masked_store:
+ case Intrinsic::masked_scatter:
+ case Intrinsic::masked_compressstore:
+ return 2;
+ default:
+ llvm_unreachable("unknown vector memory intrinsic");
+ }
+}
+
+void VPWidenMemIntrinsicRecipe::execute(VPTransformState &State) {
+ assert(State.VF.isVector() && "not widening");
+
+ SmallVector<Type *, 2> TysForDecl;
+ // Add return type if intrinsic is overloaded on it.
+ if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
+ TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
+ SmallVector<Value *, 4> Args;
+ for (const auto &I : enumerate(operands())) {
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ Value *Arg;
+ if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
+ State.TTI))
+ Arg = State.get(I.value(), VPLane(0));
+ else
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
+ if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
+ State.TTI))
+ TysForDecl.push_back(Arg->getType());
+ Args.push_back(Arg);
+ }
+
+ // Use vector version of the intrinsic.
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *VectorF =
+ Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
+ assert(VectorF &&
+ "Can't retrieve vector intrinsic or vector-predication intrinsics.");
+
+ CallInst *MemI = State.Builder.CreateCall(VectorF, Args);
+ MemI->addParamAttr(
+ getMemoryPointerParamPos(),
+ Attribute::getWithAlignment(MemI->getContext(), Alignment));
+ applyMetadata(*MemI);
+
+ if (!MemI->getType()->isVoidTy())
+ State.set(getVPSingleValue(), MemI);
+}
+
+InstructionCost
+VPWidenMemIntrinsicRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ return Ctx.TTI.getMemIntrinsicInstrCost(
+ MemIntrinsicCostAttributes(VectorIntrinsicID, Ty, Ptr,
+ match(getMask(), m_True()), Alignment,
+ &Ingredient),
+ Ctx.CostKind);
+}
+
+// Copied from VPWidenIntrinsicRecipe::usesFirstLaneOnly.
+bool VPWidenMemIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+ return all_of(enumerate(operands()), [this, &Op](const auto &X) {
+ auto [Idx, V] = X;
+ return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
+ Idx, nullptr);
+ });
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenMemIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-MEM-INTRINSIC ";
+ if (ResultTy->isVoidTy()) {
+ O << "void ";
+ } else {
+ getVPSingleValue()->printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+
+ O << "call ";
+ O << getIntrinsicName() << "(";
+
+ interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
+ Op->printAsOperand(O, SlotTracker);
+ });
+ O << ")";
+}
+#endif
+
void VPHistogramRecipe::execute(VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 02853f77db258..fb51cbf915a63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2797,6 +2797,18 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
+ if (auto *MI = dyn_cast<VPWidenMemIntrinsicRecipe>(&CurRecipe))
+ if (MI->getVectorIntrinsicID() == Intrinsic::experimental_vp_strided_load &&
+ match(MI->getMask(), m_RemoveMask(HeaderMask, Mask))) {
+ VPWidenMemIntrinsicRecipe *NewMI = MI->clone();
+ if (Mask)
+ NewMI->setMask(Mask);
+ else
+ NewMI->setMask(Plan->getTrue());
+ NewMI->setOperand(3, &EVL);
+ return NewMI;
+ }
+
if (auto *StridedL = dyn_cast<VPWidenStridedLoadRecipe>(&CurRecipe))
if (StridedL->isMasked() &&
match(StridedL->getMask(), m_RemoveMask(HeaderMask, Mask)))
@@ -2861,8 +2873,13 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
assert(
all_of(
Plan.getVF().users(),
- IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
- VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>) &&
+ [&LoopRegion](VPUser *U) {
+ auto *R = cast<VPRecipeBase>(U);
+ return (R->getParent()->getParent() != LoopRegion) ||
+ isa<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
+ VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>(
+ R);
+ }) &&
"User of VF that we can't transform to EVL.");
Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
@@ -2960,8 +2977,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
"New recipe must define the same number of values as the "
"original.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe,
- VPInterleaveEVLRecipe>(EVLRecipe)) {
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPWidenMemIntrinsicRecipe,
+ VPWidenStridedLoadRecipe, VPInterleaveEVLRecipe>(EVLRecipe)) {
for (unsigned I = 0; I < NumDefVal; ++I) {
VPValue *CurVPV = CurRecipe->getVPValue(I);
CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
@@ -5207,6 +5224,7 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
DenseMap<VPWidenGEPRecipe *, std::tuple<VPValue *, VPValue *, Type *>>
StrideCache;
SmallVector<VPWidenMemoryRecipe *> ToErase;
+ VPValue *I32VF = nullptr;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
@@ -5260,6 +5278,14 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
assert(StrideInElement && ElementTy &&
"Can not get stride information for a strided access");
+      // Create an i32 version of VF to use as the EVL operand.
+ if (!I32VF) {
+ VPBuilder Builder(Plan.getVectorPreheader());
+ I32VF = Builder.createScalarZExtOrTrunc(
+ &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
+ TypeInfo.inferScalarType(&Plan.getVF()), DebugLoc::getUnknown());
+ }
+
// Create a new vector pointer for strided access.
auto *NewPtr = new VPVectorPointerRecipe(
BasePtr, ElementTy, StrideInElement, Ptr->getGEPNoWrapFlags(),
@@ -5280,11 +5306,16 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
StrideInBytes = ScaledStride;
}
- auto *StridedLoad = new VPWidenStridedLoadRecipe(
- *cast<LoadInst>(&Ingredient), NewPtr, StrideInBytes, &Plan.getVF(),
- LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
+ VPValue *Mask;
+ if (VPValue *LoadMask = LoadR->getMask())
+ Mask = LoadMask;
+ else
+ Mask = Plan.getTrue();
+ auto *StridedLoad = new VPWidenMemIntrinsicRecipe(
+ *cast<LoadInst>(&Ingredient), Intrinsic::experimental_vp_strided_load,
+ {NewPtr, StrideInBytes, Mask, I32VF}, *LoadR, LoadR->getDebugLoc());
StridedLoad->insertBefore(LoadR);
- LoadR->replaceAllUsesWith(StridedLoad);
+ LoadR->replaceAllUsesWith(StridedLoad->getVPSingleValue());
ToErase.push_back(LoadR);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index ae946836170ba..a1224ee5081c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -51,6 +51,7 @@ class LLVM_ABI_FOR_TEST VPValue {
friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
+ friend class VPWidenMemIntrinsicRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -358,6 +359,7 @@ class VPDef {
VPWidenCastSC,
VPWidenGEPSC,
VPWidenIntrinsicSC,
+ VPWidenMemIntrinsicSC,
VPWidenStridedLoadSC,
VPWidenLoadEVLSC,
VPWidenLoadSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index a36aee96e4154..7f1ca468e5ac2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -158,9 +158,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
};
return all_of(EVL.users(), [this, &VerifyEVLUse](VPUser *U) {
return TypeSwitch<const VPUser *, bool>(U)
- .Case<VPWidenIntrinsicRecipe>([&](const VPWidenIntrinsicRecipe *S) {
- return VerifyEVLUse(*S, S->getNumOperands() - 1);
- })
+ .Case<VPWidenIntrinsicRecipe, VPWidenMemIntrinsicRecipe>(
+ [&](const VPRecipeBase *R) {
+ return VerifyEVLUse(*R, R->getNumOperands() - 1);
+ })
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe,
VPWidenStridedLoadRecipe>(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index c30b13de57806..27b700c3d0af4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -55,6 +55,7 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-UF2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 [[N_MOD_VF]]
; CHECK-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[TMP6]]
+; CHECK-UF2-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-UF2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-UF2-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], splat (i64 1)
; CHECK-UF2-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
@@ -73,10 +74,8 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-UF2-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP23]], 2
; CHECK-UF2-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8
; CHECK-UF2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP17]]
-; CHECK-UF2-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP22]], i64 32, <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
-; CHECK-UF2-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
+; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; CHECK-UF2-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-UF2-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], splat (i32 1)
; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true))
@@ -170,6 +169,7 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
; CHECK-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 64
+; CHECK-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-UF2-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[BROADCAST_SPLAT]], splat (i64 64)
; CHECK-UF2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-UF2-NEXT: [[TMP8:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP7]], splat (i64 64)
@@ -187,10 +187,8 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-UF2-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2
; CHECK-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP20]], 64
; CHECK-UF2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP14]]
-; CHECK-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 256, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
-; CHECK-UF2-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 256, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 256, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; CHECK-UF2-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; CHECK-UF2-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], splat (i32 1)
; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true))
@@ -907,6 +905,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 2
; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP30]]
; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-UF2-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP29]] to i32
; STRIDED-UF2-NEXT: [[TMP35:%.*]] = mul i64 [[STRIDE]], 4
; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -926,10 +925,8 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 2
; STRIDED-UF2-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], [[STRIDE]]
; STRIDED-UF2-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP44]], i64 [[TMP47]]
-; STRIDED-UF2-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP29]] to i32
; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP44]], i64 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP42]]), !alias.scope [[META8:![0-9]+]]
-; STRIDED-UF2-NEXT: [[TMP43:%.*]] = trunc i64 [[TMP29]] to i32
-; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP48]], i64 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8]]
+; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP48]], i64 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP42]]), !alias.scope [[META8]]
; STRIDED-UF2-NEXT: [[TMP37:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; STRIDED-UF2-NEXT: [[TMP38:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER12]], splat (i32 1)
; STRIDED-UF2-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP33]]
@@ -1364,6 +1361,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-UF2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; NOSTRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
; NOSTRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NOSTRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32
; NOSTRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]]
; NOSTRIDED-UF2: vector.body:
; NOSTRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1371,10 +1369,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1
; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 [[TMP7]]
-; NOSTRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32
; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP15]])
-; NOSTRIDED-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
-; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP16]])
+; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP15]])
; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]]
; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
@@ -1437,6 +1433,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-UF2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32
; STRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]]
; STRIDED-UF2: vector.body:
; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1444,10 +1441,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-UF2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1
; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 [[TMP7]]
-; STRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32
; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP15]])
-; STRIDED-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32
-; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP16]])
+; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP15]])
; STRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]]
; STRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
index 7b184f6e02976..805d52bf7570f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
@@ -44,11 +44,11 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]]
-; NO-VP-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
; NO-VP-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[ARRAYIDX3]], i64 4, <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_STRIDED_LOAD]]
; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP6]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x float> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 16b367c672cd8..bdf32dd12e474 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -157,20 +157,18 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
+; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[INDEX]], i32 0
-; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32
; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; NO-VP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP3]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER1]]
; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[INDEX]], i32 3
-; NO-VP-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP3]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -388,20 +386,18 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
+; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[INDEX]], i32 0
-; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32
; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; NO-VP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP3]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER1]]
; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[INDEX]], i32 2
-; NO-VP-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP3]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; NO-VP-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -618,25 +614,22 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; NO-VP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; NO-VP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; NO-VP-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER3]]
; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2
-; NO-VP-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP23]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP23]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; NO-VP-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[TMP12]], [[WIDE_MASKED_GATHER4]]
; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3
-; NO-VP-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP5]] to i32
-; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 -16, <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; NO-VP-NEXT: [[TMP16]] = add <vscale x 4 x i32> [[TMP14]], [[WIDE_MASKED_GATHER5]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
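(Aside, not taken from the patch itself: a rough scalar model of what each @llvm.experimental.vp.strided.load call in the CHECK lines above computes. The only details carried over from the IR are that the stride operand is in bytes and may be negative, as in the -16 reverse cases, that the mask here is splat (i1 true), and that the last i32 operand is the explicit vector length; the helper name and template below are made up for illustration.)

#include <cstdint>
#include <cstring>

// Lanes i < evl whose mask bit is set read sizeof(T) bytes from
// base + i * stride_in_bytes; all other lanes are left untouched here
// (the IR intrinsic leaves such lanes poison).
template <typename T>
void vpStridedLoad(T *dst, const char *base, int64_t stride_in_bytes,
                   const bool *mask, uint32_t evl) {
  for (uint32_t i = 0; i < evl; ++i)
    if (mask[i])
      std::memcpy(&dst[i], base + int64_t(i) * stride_in_bytes, sizeof(T));
}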
>From e70ab6f7fce03c3b9fb4bb846f9219c8048ea625 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 8 Dec 2025 17:52:55 -0800
Subject: [PATCH 18/25] Inherit from VPWidenIntrinsicRecipe
---
llvm/lib/Transforms/Vectorize/VPlan.h | 127 ++++++++----------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 104 ++++----------
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 -
.../Transforms/Vectorize/VPlanVerifier.cpp | 7 +-
5 files changed, 83 insertions(+), 158 deletions(-)
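(The shape of this refactoring, reduced to a standalone C++ sketch with made-up class names rather than the real VPlan types: the base class gains a protected constructor that accepts the concrete subclass ID plus a protected helper that builds the call, and the derived memory recipe keeps only what is specific to it, namely the alignment attached to the pointer operand.)

#include <cstdio>

class WidenIntrinsicBase {
protected:
  // Subclasses forward their own class ID; the public constructor passes the
  // base class's ID, mirroring VPWidenIntrinsicSC vs. VPWidenMemIntrinsicSC.
  WidenIntrinsicBase(unsigned ClassID, unsigned IntrinsicID)
      : ClassID(ClassID), IntrinsicID(IntrinsicID) {}

  // Shared lowering step, reused by every execute() below.
  int createVectorCall() const { return 0; /* stands in for the CallInst */ }

public:
  explicit WidenIntrinsicBase(unsigned IntrinsicID)
      : WidenIntrinsicBase(/*ClassID=*/0, IntrinsicID) {}
  virtual ~WidenIntrinsicBase() = default;

  virtual void execute() { (void)createVectorCall(); }

  const unsigned ClassID;
  const unsigned IntrinsicID;
};

class WidenMemIntrinsic final : public WidenIntrinsicBase {
  unsigned Alignment;

public:
  WidenMemIntrinsic(unsigned IntrinsicID, unsigned Alignment)
      : WidenIntrinsicBase(/*ClassID=*/1, IntrinsicID), Alignment(Alignment) {}

  // Only the memory-specific step remains: tag the pointer argument with its
  // alignment after the shared helper has produced the call.
  void execute() override {
    int Call = createVectorCall();
    std::printf("call %d gets align %u on its pointer operand\n", Call,
                Alignment);
  }
};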
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c8cdae13e58b5..d3e6e4e25273b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -556,6 +556,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenCastSC:
case VPRecipeBase::VPWidenGEPSC:
case VPRecipeBase::VPWidenIntrinsicSC:
+ case VPRecipeBase::VPWidenMemIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
case VPRecipeBase::VPBlendSC:
@@ -572,7 +573,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
- case VPRecipeBase::VPWidenMemIntrinsicSC:
case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
@@ -935,6 +935,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPWidenCallSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
R->getVPDefID() == VPRecipeBase::VPWidenIntrinsicSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenMemIntrinsicSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSelectSC ||
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
@@ -1617,6 +1618,29 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
/// True if the intrinsic may have side-effects.
bool MayHaveSideEffects;
+protected:
+ VPWidenIntrinsicRecipe(const unsigned char SC,
+ Intrinsic::ID VectorIntrinsicID,
+ ArrayRef<VPValue *> CallArguments, Type *Ty,
+ const VPIRFlags &Flags = {},
+ const VPIRMetadata &Metadata = {},
+ DebugLoc DL = DebugLoc::getUnknown())
+ : VPRecipeWithIRFlags(SC, CallArguments, Flags, DL),
+ VPIRMetadata(Metadata), VectorIntrinsicID(VectorIntrinsicID),
+ ResultTy(Ty) {
+ LLVMContext &Ctx = Ty->getContext();
+ AttributeSet Attrs = Intrinsic::getFnAttributes(Ctx, VectorIntrinsicID);
+ MemoryEffects ME = Attrs.getMemoryEffects();
+ MayReadFromMemory = !ME.onlyWritesMemory();
+ MayWriteToMemory = !ME.onlyReadsMemory();
+ MayHaveSideEffects = MayWriteToMemory ||
+ !Attrs.hasAttribute(Attribute::NoUnwind) ||
+ !Attrs.hasAttribute(Attribute::WillReturn);
+ }
+
+ /// Helper function to produce the widened intrinsic call.
+ CallInst *createVectorCall(VPTransformState &State);
+
public:
VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
@@ -1637,19 +1661,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
const VPIRFlags &Flags = {},
const VPIRMetadata &Metadata = {},
DebugLoc DL = DebugLoc::getUnknown())
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, Flags,
- DL),
- VPIRMetadata(Metadata), VectorIntrinsicID(VectorIntrinsicID),
- ResultTy(Ty) {
- LLVMContext &Ctx = Ty->getContext();
- AttributeSet Attrs = Intrinsic::getFnAttributes(Ctx, VectorIntrinsicID);
- MemoryEffects ME = Attrs.getMemoryEffects();
- MayReadFromMemory = !ME.onlyWritesMemory();
- MayWriteToMemory = !ME.onlyReadsMemory();
- MayHaveSideEffects = MayWriteToMemory ||
- !Attrs.hasAttribute(Attribute::NoUnwind) ||
- !Attrs.hasAttribute(Attribute::WillReturn);
- }
+ : VPWidenIntrinsicRecipe(VPDef::VPWidenIntrinsicSC, VectorIntrinsicID,
+ CallArguments, Ty, Flags, Metadata, DL) {}
~VPWidenIntrinsicRecipe() override = default;
@@ -1662,7 +1675,24 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
*this, *this, getDebugLoc());
}
- VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC)
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenIntrinsicSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenMemIntrinsicSC;
+ }
+
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
+
+ static inline bool classof(const VPValue *V) {
+ auto *R = V->getDefiningRecipe();
+ return R && classof(R);
+ }
+
+ static inline bool classof(const VPSingleDefRecipe *R) {
+ return classof(static_cast<const VPRecipeBase *>(R));
+ }
/// Produce a widened version of the vector intrinsic.
void execute(VPTransformState &State) override;
@@ -1700,48 +1730,28 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
};
/// A recipe for widening vector memory intrinsics.
-class VPWidenMemIntrinsicRecipe : public VPRecipeBase, public VPIRMetadata {
- Instruction &Ingredient;
-
+class VPWidenMemIntrinsicRecipe final : public VPWidenIntrinsicRecipe {
/// Alignment information for this memory access.
Align Alignment;
- /// ID of the vector intrinsic to widen.
- Intrinsic::ID VectorIntrinsicID;
-
- /// Scalar return type of the intrinsic.
- Type *ResultTy;
-
- /// True if the intrinsic may read from memory.
- bool MayReadFromMemory;
-
- /// True if the intrinsic may read write to memory.
- bool MayWriteToMemory;
-
- /// True if the intrinsic may have side-effects.
- bool MayHaveSideEffects;
-
public:
// TODO: support StoreInst for strided store
VPWidenMemIntrinsicRecipe(LoadInst &LI, Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments,
const VPIRMetadata &MD = {},
DebugLoc DL = DebugLoc::getUnknown())
- : VPRecipeBase(VPDef::VPWidenMemIntrinsicSC, CallArguments, DL),
- VPIRMetadata(MD), Ingredient(LI), Alignment(LI.getAlign()),
- VectorIntrinsicID(VectorIntrinsicID), ResultTy(LI.getType()),
- MayReadFromMemory(LI.mayReadFromMemory()),
- MayWriteToMemory(LI.mayWriteToMemory()),
- MayHaveSideEffects(LI.mayHaveSideEffects()) {
- new VPValue(&LI, this);
+ : VPWidenIntrinsicRecipe(VPDef::VPWidenMemIntrinsicSC, VectorIntrinsicID,
+ CallArguments, LI.getType(), {}, MD, DL),
+ Alignment(LI.getAlign()) {
+ setUnderlyingValue(&LI);
}
~VPWidenMemIntrinsicRecipe() override = default;
VPWidenMemIntrinsicRecipe *clone() override {
- return new VPWidenMemIntrinsicRecipe(*cast<LoadInst>(&Ingredient),
- VectorIntrinsicID, operands(), *this,
- getDebugLoc());
+ return new VPWidenMemIntrinsicRecipe(*cast<LoadInst>(getUnderlyingInstr()),
+ getVectorIntrinsicID(), operands(),
+ *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenMemIntrinsicSC)
@@ -1753,42 +1763,15 @@ class VPWidenMemIntrinsicRecipe : public VPRecipeBase, public VPIRMetadata {
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;
- /// Return the ID of the intrinsic.
- Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; }
-
- /// Return the scalar return type of the intrinsic.
- Type *getResultType() const { return ResultTy; }
-
- /// Return to name of the intrinsic as string.
- StringRef getIntrinsicName() const {
- return Intrinsic::getBaseName(VectorIntrinsicID);
- }
-
- /// Returns true if the intrinsic may read from memory.
- bool mayReadFromMemory() const { return MayReadFromMemory; }
-
- /// Returns true if the intrinsic may write to memory.
- bool mayWriteToMemory() const { return MayWriteToMemory; }
-
- /// Returns true if the intrinsic may have side-effects.
- bool mayHaveSideEffects() const { return MayHaveSideEffects; }
-
+ /// Return the index of pointer parameter.
unsigned getMemoryPointerParamPos() const;
+ /// Return the index of mask parameter.
unsigned getMaskParamPos() const;
void setMask(VPValue *Mask) { setOperand(getMaskParamPos(), Mask); }
VPValue *getMask() const { return getOperand(getMaskParamPos()); }
-
- bool usesFirstLaneOnly(const VPValue *Op) const override;
-
-protected:
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void printRecipe(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
};
/// A recipe for widening Call instructions using library calls.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 2e149cdec7e16..f44b6ccf8c35a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -300,7 +300,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
- VPWidenMemIntrinsicRecipe, VPWidenCastRecipe>(
+ VPWidenCastRecipe>(
[](const auto *R) { return R->getResultType(); })
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 29ee35f6c4441..33b6cb788670c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -70,10 +70,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyReadsMemory();
+ case VPWidenMemIntrinsicSC:
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
- case VPWidenMemIntrinsicSC:
- return cast<VPWidenMemIntrinsicRecipe>(this)->mayWriteToMemory();
case VPCanonicalIVPHISC:
case VPBranchOnMaskSC:
case VPDerivedIVSC:
@@ -126,10 +125,9 @@ bool VPRecipeBase::mayReadFromMemory() const {
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyWritesMemory();
+ case VPWidenMemIntrinsicSC:
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
- case VPWidenMemIntrinsicSC:
- return cast<VPWidenMemIntrinsicRecipe>(this)->mayReadFromMemory();
case VPBranchOnMaskSC:
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
@@ -182,10 +180,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
}
+ case VPWidenMemIntrinsicSC:
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
- case VPWidenMemIntrinsicSC:
- return cast<VPWidenMemIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
@@ -1699,9 +1696,7 @@ void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
- assert(State.VF.isVector() && "not widening");
-
+CallInst *VPWidenIntrinsicRecipe::createVectorCall(VPTransformState &State) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
@@ -1729,7 +1724,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
assert(VectorF &&
"Can't retrieve vector intrinsic or vector-predication intrinsics.");
- auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
+ auto *CI = dyn_cast_or_null<CallInst>(getUnderlyingValue());
SmallVector<OperandBundleDef, 1> OpBundles;
if (CI)
CI->getOperandBundlesAsDefs(OpBundles);
@@ -1739,6 +1734,12 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
applyFlags(*V);
applyMetadata(*V);
+ return V;
+}
+
+void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
+ assert(State.VF.isVector() && "not widening");
+ CallInst *V = createVectorCall(State);
if (!V->getType()->isVoidTy())
State.set(this, V);
}
@@ -1829,10 +1830,11 @@ void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
#endif
unsigned VPWidenMemIntrinsicRecipe::getMemoryPointerParamPos() const {
- if (auto Pos = VPIntrinsic::getMemoryPointerParamPos(VectorIntrinsicID))
+ Intrinsic::ID IID = getVectorIntrinsicID();
+ if (auto Pos = VPIntrinsic::getMemoryPointerParamPos(IID))
return *Pos;
- switch (VectorIntrinsicID) {
+ switch (IID) {
case Intrinsic::masked_load:
case Intrinsic::masked_gather:
case Intrinsic::masked_expandload:
@@ -1847,10 +1849,11 @@ unsigned VPWidenMemIntrinsicRecipe::getMemoryPointerParamPos() const {
}
unsigned VPWidenMemIntrinsicRecipe::getMaskParamPos() const {
- if (auto Pos = VPIntrinsic::getMaskParamPos(VectorIntrinsicID))
+ Intrinsic::ID IID = getVectorIntrinsicID();
+ if (auto Pos = VPIntrinsic::getMaskParamPos(IID))
return *Pos;
- switch (VectorIntrinsicID) {
+ switch (IID) {
case Intrinsic::masked_load:
case Intrinsic::masked_gather:
case Intrinsic::masked_expandload:
@@ -1866,87 +1869,28 @@ unsigned VPWidenMemIntrinsicRecipe::getMaskParamPos() const {
void VPWidenMemIntrinsicRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
-
- SmallVector<Type *, 2> TysForDecl;
- // Add return type if intrinsic is overloaded on it.
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
- TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
- SmallVector<Value *, 4> Args;
- for (const auto &I : enumerate(operands())) {
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- Value *Arg;
- if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
- State.TTI))
- Arg = State.get(I.value(), VPLane(0));
- else
- Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
- State.TTI))
- TysForDecl.push_back(Arg->getType());
- Args.push_back(Arg);
- }
-
- // Use vector version of the intrinsic.
- Module *M = State.Builder.GetInsertBlock()->getModule();
- Function *VectorF =
- Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
- assert(VectorF &&
- "Can't retrieve vector intrinsic or vector-predication intrinsics.");
-
- CallInst *MemI = State.Builder.CreateCall(VectorF, Args);
+ CallInst *MemI = createVectorCall(State);
MemI->addParamAttr(
getMemoryPointerParamPos(),
Attribute::getWithAlignment(MemI->getContext(), Alignment));
- applyMetadata(*MemI);
if (!MemI->getType()->isVoidTy())
- State.set(getVPSingleValue(), MemI);
+ State.set(this, MemI);
}
InstructionCost
VPWidenMemIntrinsicRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
- const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ const Instruction *Ingredient = getUnderlyingInstr();
+ Type *Ty = toVectorTy(getLoadStoreType(Ingredient), VF);
+ const Value *Ptr = getLoadStorePointerOperand(Ingredient);
return Ctx.TTI.getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(VectorIntrinsicID, Ty, Ptr,
+ MemIntrinsicCostAttributes(getVectorIntrinsicID(), Ty, Ptr,
match(getMask(), m_True()), Alignment,
- &Ingredient),
+ Ingredient),
Ctx.CostKind);
}
-// Copy from VPWidenIntrinsicRecipe::usesFirstLaneOnly.
-bool VPWidenMemIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
- assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
- return all_of(enumerate(operands()), [this, &Op](const auto &X) {
- auto [Idx, V] = X;
- return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
- Idx, nullptr);
- });
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenMemIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-MEM-INTRINSIC ";
- if (ResultTy->isVoidTy()) {
- O << "void ";
- } else {
- getVPSingleValue()->printAsOperand(O, SlotTracker);
- O << " = ";
- }
-
- O << "call ";
- O << getIntrinsicName() << "(";
-
- interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
- Op->printAsOperand(O, SlotTracker);
- });
- O << ")";
-}
-#endif
-
void VPHistogramRecipe::execute(VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a1224ee5081c8..fd120fe571e35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -51,7 +51,6 @@ class LLVM_ABI_FOR_TEST VPValue {
friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
- friend class VPWidenMemIntrinsicRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7f1ca468e5ac2..26c3c63de44ab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -158,10 +158,9 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
};
return all_of(EVL.users(), [this, &VerifyEVLUse](VPUser *U) {
return TypeSwitch<const VPUser *, bool>(U)
- .Case<VPWidenIntrinsicRecipe, VPWidenMemIntrinsicRecipe>(
- [&](const VPRecipeBase *R) {
- return VerifyEVLUse(*R, R->getNumOperands() - 1);
- })
+ .Case<VPWidenIntrinsicRecipe>([&](const VPRecipeBase *R) {
+ return VerifyEVLUse(*R, R->getNumOperands() - 1);
+ })
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe,
VPWidenStridedLoadRecipe>(
>From 5b5ef06b22d0dc9543a085bf821333ea2bfc7a34 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 8 Dec 2025 18:20:05 -0800
Subject: [PATCH 19/25] narrow to Intrinsic::experimental_vp_strided_load
---
llvm/lib/Transforms/Vectorize/VPlan.h | 19 +++----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 51 ++-----------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 6 +--
3 files changed, 14 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d3e6e4e25273b..67b3926986c79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1736,11 +1736,11 @@ class VPWidenMemIntrinsicRecipe final : public VPWidenIntrinsicRecipe {
public:
// TODO: support StoreInst for strided store
- VPWidenMemIntrinsicRecipe(LoadInst &LI, Intrinsic::ID VectorIntrinsicID,
- ArrayRef<VPValue *> CallArguments,
+ VPWidenMemIntrinsicRecipe(LoadInst &LI, ArrayRef<VPValue *> CallArguments,
const VPIRMetadata &MD = {},
DebugLoc DL = DebugLoc::getUnknown())
- : VPWidenIntrinsicRecipe(VPDef::VPWidenMemIntrinsicSC, VectorIntrinsicID,
+ : VPWidenIntrinsicRecipe(VPDef::VPWidenMemIntrinsicSC,
+ Intrinsic::experimental_vp_strided_load,
CallArguments, LI.getType(), {}, MD, DL),
Alignment(LI.getAlign()) {
setUnderlyingValue(&LI);
@@ -1750,8 +1750,7 @@ class VPWidenMemIntrinsicRecipe final : public VPWidenIntrinsicRecipe {
VPWidenMemIntrinsicRecipe *clone() override {
return new VPWidenMemIntrinsicRecipe(*cast<LoadInst>(getUnderlyingInstr()),
- getVectorIntrinsicID(), operands(),
- *this, getDebugLoc());
+ operands(), *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenMemIntrinsicSC)
@@ -1763,15 +1762,9 @@ class VPWidenMemIntrinsicRecipe final : public VPWidenIntrinsicRecipe {
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;
- /// Return the index of pointer parameter.
- unsigned getMemoryPointerParamPos() const;
+ void setMask(VPValue *Mask) { setOperand(2, Mask); }
- /// Return the index of mask parameter.
- unsigned getMaskParamPos() const;
-
- void setMask(VPValue *Mask) { setOperand(getMaskParamPos(), Mask); }
-
- VPValue *getMask() const { return getOperand(getMaskParamPos()); }
+ VPValue *getMask() const { return getOperand(2); }
};
/// A recipe for widening Call instructions using library calls.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33b6cb788670c..5366c81525ebd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1697,6 +1697,8 @@ void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
#endif
CallInst *VPWidenIntrinsicRecipe::createVectorCall(VPTransformState &State) {
+ assert(State.VF.isVector() && "not widening");
+
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
@@ -1738,7 +1740,6 @@ CallInst *VPWidenIntrinsicRecipe::createVectorCall(VPTransformState &State) {
}
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
- assert(State.VF.isVector() && "not widening");
CallInst *V = createVectorCall(State);
if (!V->getType()->isVoidTy())
State.set(this, V);
@@ -1829,60 +1830,18 @@ void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
-unsigned VPWidenMemIntrinsicRecipe::getMemoryPointerParamPos() const {
- Intrinsic::ID IID = getVectorIntrinsicID();
- if (auto Pos = VPIntrinsic::getMemoryPointerParamPos(IID))
- return *Pos;
-
- switch (IID) {
- case Intrinsic::masked_load:
- case Intrinsic::masked_gather:
- case Intrinsic::masked_expandload:
- return 0;
- case Intrinsic::masked_store:
- case Intrinsic::masked_scatter:
- case Intrinsic::masked_compressstore:
- return 1;
- default:
- llvm_unreachable("unknown vector memory intrinsic");
- }
-}
-
-unsigned VPWidenMemIntrinsicRecipe::getMaskParamPos() const {
- Intrinsic::ID IID = getVectorIntrinsicID();
- if (auto Pos = VPIntrinsic::getMaskParamPos(IID))
- return *Pos;
-
- switch (IID) {
- case Intrinsic::masked_load:
- case Intrinsic::masked_gather:
- case Intrinsic::masked_expandload:
- return 1;
- case Intrinsic::masked_store:
- case Intrinsic::masked_scatter:
- case Intrinsic::masked_compressstore:
- return 2;
- default:
- llvm_unreachable("unknown vector memory intrinsic");
- }
-}
-
void VPWidenMemIntrinsicRecipe::execute(VPTransformState &State) {
- assert(State.VF.isVector() && "not widening");
CallInst *MemI = createVectorCall(State);
MemI->addParamAttr(
- getMemoryPointerParamPos(),
- Attribute::getWithAlignment(MemI->getContext(), Alignment));
-
- if (!MemI->getType()->isVoidTy())
- State.set(this, MemI);
+ 0, Attribute::getWithAlignment(MemI->getContext(), Alignment));
+ State.set(this, MemI);
}
InstructionCost
VPWidenMemIntrinsicRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
+ Type *Ty = toVectorTy(getResultType(), VF);
const Instruction *Ingredient = getUnderlyingInstr();
- Type *Ty = toVectorTy(getLoadStoreType(Ingredient), VF);
const Value *Ptr = getLoadStorePointerOperand(Ingredient);
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(getVectorIntrinsicID(), Ty, Ptr,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fb51cbf915a63..6cd10755d739b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5312,10 +5312,10 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
else
Mask = Plan.getTrue();
auto *StridedLoad = new VPWidenMemIntrinsicRecipe(
- *cast<LoadInst>(&Ingredient), Intrinsic::experimental_vp_strided_load,
- {NewPtr, StrideInBytes, Mask, I32VF}, *LoadR, LoadR->getDebugLoc());
+ *cast<LoadInst>(&Ingredient), {NewPtr, StrideInBytes, Mask, I32VF},
+ *LoadR, LoadR->getDebugLoc());
StridedLoad->insertBefore(LoadR);
- LoadR->replaceAllUsesWith(StridedLoad->getVPSingleValue());
+ LoadR->replaceAllUsesWith(StridedLoad);
ToErase.push_back(LoadR);
}
>From 61c1774644d531669ea7ced80c0ff7d2d980158b Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 8 Dec 2025 19:35:12 -0800
Subject: [PATCH 20/25] pattern matching Intrinsic::experimental_vp_strided_load

---
llvm/lib/Transforms/Vectorize/VPlan.h | 4 ----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 24 ++++++++++---------
3 files changed, 14 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 67b3926986c79..8df41c5ce1fd6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1761,10 +1761,6 @@ class VPWidenMemIntrinsicRecipe final : public VPWidenIntrinsicRecipe {
/// Return the cost of this vector memory intrinsic.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;
-
- void setMask(VPValue *Mask) { setOperand(2, Mask); }
-
- VPValue *getMask() const { return getOperand(2); }
};
/// A recipe for widening Call instructions using library calls.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5366c81525ebd..78b8a00657a2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1845,7 +1845,7 @@ VPWidenMemIntrinsicRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(Ingredient);
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(getVectorIntrinsicID(), Ty, Ptr,
- match(getMask(), m_True()), Alignment,
+ match(getOperand(2), m_True()), Alignment,
Ingredient),
Ctx.CostKind);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6cd10755d739b..c5d41435f3e06 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2797,17 +2797,19 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
- if (auto *MI = dyn_cast<VPWidenMemIntrinsicRecipe>(&CurRecipe))
- if (MI->getVectorIntrinsicID() == Intrinsic::experimental_vp_strided_load &&
- match(MI->getMask(), m_RemoveMask(HeaderMask, Mask))) {
- VPWidenMemIntrinsicRecipe *NewMI = MI->clone();
- if (Mask)
- NewMI->setMask(Mask);
- else
- NewMI->setMask(Plan->getTrue());
- NewMI->setOperand(3, &EVL);
- return NewMI;
- }
+ VPValue *Stride;
+ if (match(&CurRecipe, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
+ m_VPValue(Addr), m_VPValue(Stride),
+ m_RemoveMask(HeaderMask, Mask),
+ m_VPInstruction<Instruction::Trunc>(
+ m_Specific(&Plan->getVF()))))) {
+ auto *I = cast<VPWidenIntrinsicRecipe>(&CurRecipe);
+ if (!Mask)
+ Mask = Plan->getTrue();
+ return new VPWidenMemIntrinsicRecipe(
+ *cast<LoadInst>(I->getUnderlyingInstr()), {Addr, Stride, Mask, &EVL},
+ *I, I->getDebugLoc());
+ }
if (auto *StridedL = dyn_cast<VPWidenStridedLoadRecipe>(&CurRecipe))
if (StridedL->isMasked() &&
>From 015eccf7f4e0f695bffd935e2429355eb606c9ca Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 8 Dec 2025 22:57:25 -0800
Subject: [PATCH 21/25] assert scalable and vector indexed type
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c5d41435f3e06..273fd1f3dceaf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5198,8 +5198,9 @@ determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) {
return {nullptr, nullptr, nullptr};
Type *ElementTy = WidenGEP->getIndexedType(*VarIndex);
- if (ElementTy->isScalableTy() || ElementTy->isStructTy() ||
- ElementTy->isVectorTy())
+ assert(!ElementTy->isScalableTy() && !ElementTy->isVectorTy() &&
+ "Unexpected indexed type");
+ if (ElementTy->isStructTy())
return {nullptr, nullptr, nullptr};
unsigned VarOp = *VarIndex + 1;
>From e702e8cea7bc6089df325d43c02f9eccb76350aa Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 8 Dec 2025 23:09:23 -0800
Subject: [PATCH 22/25] remove unused VPWidenStridedLoadRecipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 55 +------------------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 54 ------------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 29 +++-------
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 -
.../Transforms/Vectorize/VPlanVerifier.cpp | 3 +-
7 files changed, 15 insertions(+), 138 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2d8588e8d062a..0815d24496b10 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4021,7 +4021,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
- .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
+ .Case<VPWidenLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4125,7 +4125,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
- case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -7095,7 +7094,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
// The strided load is transformed from a gather through VPlanTransform,
// and its cost will be lower than the original gather.
- if (isa<VPWidenStridedLoadRecipe>(&R))
+ if (isa<VPWidenMemIntrinsicRecipe>(&R))
return true;
if (Instruction *UI = GetInstructionForCost(&R)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8df41c5ce1fd6..50108f656f913 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -573,7 +573,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
- case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -3378,8 +3377,7 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
+ R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
}
static inline bool classof(const VPUser *U) {
@@ -3505,57 +3503,6 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
#endif
};
-/// A recipe for strided load operations, using the base address, stride, VF,
-/// and an optional mask. This recipe will generate a vp.strided.load intrinsic
-/// call to represent memory accesses with a fixed stride.
-struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
- public VPValue {
- VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
- VPValue *VF, VPValue *Mask,
- const VPIRMetadata &Metadata, DebugLoc DL)
- : VPWidenMemoryRecipe(
- VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
- /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
- VPValue(this, &Load) {
- setMask(Mask);
- }
-
- VPWidenStridedLoadRecipe *clone() override {
- return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getStride(), getVF(), getMask(), *this,
- getDebugLoc());
- }
-
- VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
-
- /// Return the stride operand.
- VPValue *getStride() const { return getOperand(1); }
-
- /// Return the VF operand.
- VPValue *getVF() const { return getOperand(2); }
-
- /// Generate a strided load.
- void execute(VPTransformState &State) override;
-
- /// Return the cost of this VPWidenStridedLoadRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void printRecipe(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool usesFirstLaneOnly(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- // All operands except the mask are only used for the first lane.
- return Op == getAddr() || Op == getStride() || Op == getVF();
- }
-};
-
/// A recipe for widening store operations, using the stored value, the address
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f44b6ccf8c35a..c64b97579881a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -194,10 +194,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert(
- (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
- R)) &&
- "Store recipes should not define any values");
+ assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 78b8a00657a2b..3c406ec2298b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -89,7 +89,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
- case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -114,7 +113,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
- case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -206,7 +204,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
- case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -3686,57 +3683,6 @@ void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
- Type *ScalarDataTy = getLoadStoreType(&Ingredient);
- auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-
- auto &Builder = State.Builder;
- Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
- Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true);
- Value *Mask = nullptr;
- if (VPValue *VPMask = getMask())
- Mask = State.get(VPMask);
- else
- Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
- Builder.getInt32Ty());
-
- auto *PtrTy = Addr->getType();
- auto *StrideTy = StrideInBytes->getType();
- CallInst *NewLI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
- {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
- NewLI->addParamAttr(
- 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
- applyMetadata(*NewLI);
- State.set(this, NewLI);
-}
-
-InstructionCost
-VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
- const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- return Ctx.TTI.getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load, Ty,
- Ptr, IsMasked, Alignment, &Ingredient),
- Ctx.CostKind);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenStridedLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN ";
- printAsOperand(O, SlotTracker);
- O << " = load ";
- getAddr()->printAsOperand(O, SlotTracker);
- O << ", stride = ";
- getStride()->printAsOperand(O, SlotTracker);
- O << ", runtimeVF = ";
- getVF()->printAsOperand(O, SlotTracker);
-}
-#endif
-
void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 273fd1f3dceaf..de4478386e578 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2811,14 +2811,6 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
*I, I->getDebugLoc());
}
- if (auto *StridedL = dyn_cast<VPWidenStridedLoadRecipe>(&CurRecipe))
- if (StridedL->isMasked() &&
- match(StridedL->getMask(), m_RemoveMask(HeaderMask, Mask)))
- return new VPWidenStridedLoadRecipe(
- *cast<LoadInst>(&StridedL->getIngredient()), StridedL->getAddr(),
- StridedL->getStride(), &EVL, Mask, *StridedL,
- StridedL->getDebugLoc());
-
if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
!cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
@@ -2872,17 +2864,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
- assert(
- all_of(
- Plan.getVF().users(),
- [&LoopRegion](VPUser *U) {
- auto *R = cast<VPRecipeBase>(U);
- return (R->getParent()->getParent() != LoopRegion) ||
- isa<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
- VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>(
- R);
- }) &&
- "User of VF that we can't transform to EVL.");
+ assert(all_of(Plan.getVF().users(),
+ [&LoopRegion](VPUser *U) {
+ auto *R = cast<VPRecipeBase>(U);
+ return (R->getParent()->getParent() != LoopRegion) ||
+ isa<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
+ VPWidenIntOrFpInductionRecipe>(R);
+ }) &&
+ "User of VF that we can't transform to EVL.");
Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
});
@@ -2980,7 +2969,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
"original.");
EVLRecipe->insertBefore(CurRecipe);
if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPWidenMemIntrinsicRecipe,
- VPWidenStridedLoadRecipe, VPInterleaveEVLRecipe>(EVLRecipe)) {
+ VPInterleaveEVLRecipe>(EVLRecipe)) {
for (unsigned I = 0; I < NumDefVal; ++I) {
VPValue *CurVPV = CurRecipe->getVPValue(I);
CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index fd120fe571e35..442fe10eeb231 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -359,7 +359,6 @@ class VPDef {
VPWidenGEPSC,
VPWidenIntrinsicSC,
VPWidenMemIntrinsicSC,
- VPWidenStridedLoadSC,
VPWidenLoadEVLSC,
VPWidenLoadSC,
VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 26c3c63de44ab..af70f7ab8a0ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -162,8 +162,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
return VerifyEVLUse(*R, R->getNumOperands() - 1);
})
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
- VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe,
- VPWidenStridedLoadRecipe>(
+ VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
.Case<VPScalarIVStepsRecipe>([&](auto *R) {
if (R->getNumOperands() != 3) {
>From e25dc22919a7991c6dc7e86439f0c6e4a1fe1f0a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 8 Dec 2025 23:59:31 -0800
Subject: [PATCH 23/25] inline getUniqueVariantIndex()
---
llvm/lib/Transforms/Vectorize/VPlan.h | 17 ++---------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 16 ++++++++++++----
2 files changed, 14 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 50108f656f913..f416891a380de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1913,10 +1913,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
Type *SourceElementTy;
- bool isIndexLoopInvariant(unsigned I) const {
- return getOperand(I + 1)->isDefinedOutsideLoopRegions();
- }
-
public:
VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands,
const VPIRFlags &Flags = {},
@@ -1946,17 +1942,8 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
return getOperand(0)->isDefinedOutsideLoopRegions();
}
- std::optional<unsigned> getUniqueVariantIndex() const {
- std::optional<unsigned> VarIdx;
- for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
- if (isIndexLoopInvariant(I))
- continue;
-
- if (VarIdx)
- return std::nullopt;
- VarIdx = I;
- }
- return VarIdx;
+ bool isIndexLoopInvariant(unsigned I) const {
+ return getOperand(I + 1)->isDefinedOutsideLoopRegions();
}
/// Returns the element type for the first \p I indices of this recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index de4478386e578..b11a4212afb80 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5182,17 +5182,25 @@ determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) {
return {nullptr, nullptr, nullptr};
// Find the only one variant index.
- std::optional<unsigned> VarIndex = WidenGEP->getUniqueVariantIndex();
- if (!VarIndex)
+ unsigned VarOp = 0;
+ for (unsigned I = 1, E = WidenGEP->getNumOperands(); I < E; ++I) {
+ if (WidenGEP->isIndexLoopInvariant(I - 1))
+ continue;
+
+ if (VarOp != 0)
+ return {nullptr, nullptr, nullptr};
+ VarOp = I;
+ }
+
+ if (!VarOp)
return {nullptr, nullptr, nullptr};
- Type *ElementTy = WidenGEP->getIndexedType(*VarIndex);
+ Type *ElementTy = WidenGEP->getIndexedType(VarOp - 1);
assert(!ElementTy->isScalableTy() && !ElementTy->isVectorTy() &&
"Unexpected indexed type");
if (ElementTy->isStructTy())
return {nullptr, nullptr, nullptr};
- unsigned VarOp = *VarIndex + 1;
VPValue *IndexVPV = WidenGEP->getOperand(VarOp);
auto [Start, Stride] = matchStridedStart(IndexVPV);
if (!Start)
>From 2a0fe35984703b75388c038320f8cbfc8f7c61e8 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 9 Dec 2025 01:00:32 -0800
Subject: [PATCH 24/25] use createOverflowingOp to generate VPInstruction recipe
---
.../Vectorize/LoopVectorizationPlanner.h | 14 ++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +++++++----------
2 files changed, 21 insertions(+), 10 deletions(-)
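(A standalone sketch of the builder idiom this patch and the next switch to, using hypothetical types; only the createVectorPointer / createWidenMemIntrinsic helper names are quoted from the diff. The builder is positioned once at the insertion point, and each create* call both constructs and inserts the recipe, replacing the earlier new-then-insertBefore pairs.)

#include <list>
#include <memory>
#include <utility>

struct Recipe {
  virtual ~Recipe() = default;
};

struct MulRecipe final : Recipe {
  Recipe *LHS, *RHS;
  MulRecipe(Recipe *L, Recipe *R) : LHS(L), RHS(R) {}
};

class Builder {
  using Block = std::list<std::unique_ptr<Recipe>>;
  Block &BB;
  Block::iterator InsertPt; // new recipes go immediately before this position

public:
  Builder(Block &B, Block::iterator IP) : BB(B), InsertPt(IP) {}

  // Construct and insert in one step, the same shape as the new
  // createVectorPointer / createWidenMemIntrinsic helpers.
  MulRecipe *createMul(Recipe *L, Recipe *R) {
    auto New = std::make_unique<MulRecipe>(L, R);
    MulRecipe *Raw = New.get();
    BB.insert(InsertPt, std::move(New));
    return Raw;
  }
};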
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 741392247c0d6..299756507a910 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -346,6 +346,20 @@ class VPBuilder {
return tryInsertInstruction(new VPExpandSCEVRecipe(Expr));
}
+ VPVectorPointerRecipe *
+ createVectorPointer(VPValue *Ptr, Type *SourceElementTy, VPValue *Stride,
+ GEPNoWrapFlags GEPFlags, DebugLoc DL) {
+ return tryInsertInstruction(
+ new VPVectorPointerRecipe(Ptr, SourceElementTy, Stride, GEPFlags, DL));
+ }
+
+ VPWidenMemIntrinsicRecipe *
+ createWidenMemIntrinsic(LoadInst &LI, ArrayRef<VPValue *> CallArguments,
+ const VPIRMetadata &MD, DebugLoc DL) {
+ return tryInsertInstruction(
+ new VPWidenMemIntrinsicRecipe(LI, CallArguments, MD, DL));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b11a4212afb80..fc0b604f31a10 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5164,10 +5164,10 @@ static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
/*IsUniform*/ true);
StartR->insertBefore(WidenR);
+ VPBuilder Builder(WidenR);
unsigned InvIdx = VarIdx == 0 ? 1 : 0;
auto *StrideR =
- new VPInstruction(Opcode, {Stride, WidenR->getOperand(InvIdx)});
- StrideR->insertBefore(WidenR);
+ Builder.createOverflowingOp(Opcode, {Stride, WidenR->getOperand(InvIdx)});
return {StartR, StrideR};
}
@@ -5286,11 +5286,11 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
TypeInfo.inferScalarType(&Plan.getVF()), DebugLoc::getUnknown());
}
+ VPBuilder Builder(LoadR);
// Create a new vector pointer for strided access.
- auto *NewPtr = new VPVectorPointerRecipe(
+ auto *NewPtr = Builder.createVectorPointer(
BasePtr, ElementTy, StrideInElement, Ptr->getGEPNoWrapFlags(),
Ptr->getDebugLoc());
- NewPtr->insertBefore(LoadR);
const DataLayout &DL = Ingredient.getDataLayout();
TypeSize TS = DL.getTypeAllocSize(ElementTy);
@@ -5300,10 +5300,8 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
if (TypeScale != 1) {
VPValue *ScaleVPV = Plan.getConstantInt(
TypeInfo.inferScalarType(StrideInElement), TypeScale);
- auto *ScaledStride =
- new VPInstruction(Instruction::Mul, {StrideInElement, ScaleVPV});
- ScaledStride->insertBefore(LoadR);
- StrideInBytes = ScaledStride;
+ StrideInBytes = Builder.createOverflowingOp(
+ Instruction::Mul, {StrideInElement, ScaleVPV});
}
VPValue *Mask;
@@ -5311,10 +5309,9 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
Mask = LoadMask;
else
Mask = Plan.getTrue();
- auto *StridedLoad = new VPWidenMemIntrinsicRecipe(
+ auto *StridedLoad = Builder.createWidenMemIntrinsic(
*cast<LoadInst>(&Ingredient), {NewPtr, StrideInBytes, Mask, I32VF},
*LoadR, LoadR->getDebugLoc());
- StridedLoad->insertBefore(LoadR);
LoadR->replaceAllUsesWith(StridedLoad);
ToErase.push_back(LoadR);
>From 84abde2aa89d365782ba2df9a5040e6d2935dcc4 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 9 Dec 2025 01:41:47 -0800
Subject: [PATCH 25/25] Replace VPReplicateRecipe with VPInstruction
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fc0b604f31a10..ba08c5d4281e9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5158,13 +5158,13 @@ static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
if (!Start)
return {nullptr, nullptr};
+ VPBuilder Builder(WidenR);
SmallVector<VPValue *> StartOps(WidenR->operands());
StartOps[VarIdx] = Start;
- auto *StartR = new VPReplicateRecipe(WidenR->getUnderlyingInstr(), StartOps,
- /*IsUniform*/ true);
- StartR->insertBefore(WidenR);
-
- VPBuilder Builder(WidenR);
+ auto *StartR = Builder.createOverflowingOp(
+ Opcode, StartOps,
+ {WidenR->hasNoUnsignedWrap(), WidenR->hasNoSignedWrap()},
+ WidenR->getDebugLoc());
unsigned InvIdx = VarIdx == 0 ? 1 : 0;
auto *StrideR =
Builder.createOverflowingOp(Opcode, {Stride, WidenR->getOperand(InvIdx)});