[llvm] [LV][EVL] Support interleaved access with tail folding by EVL (PR #152070)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 01:12:29 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/152070
>From feb1f99b081b2e60047ba5f9aa9c8288f55eb63a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 21 Jul 2025 08:06:02 -0700
Subject: [PATCH 01/13] Support EVL interleave access
---
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 170 +++++++++++---
.../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 168 +++++++++++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 5 +-
.../Transforms/Vectorize/VPlanVerifier.cpp | 3 +-
.../LoopVectorize/RISCV/dead-ops-cost.ll | 7 +-
.../RISCV/interleaved-accesses.ll | 208 ++++--------------
.../RISCV/interleaved-masked-access.ll | 98 ++++-----
.../RISCV/tail-folding-interleave.ll | 22 +-
11 files changed, 413 insertions(+), 291 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a0f306c12754f..366ffaec6c1c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4171,6 +4171,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntOrFpInductionSC:
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
@@ -4199,8 +4200,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
// If no def nor is a store, e.g., branches, continue - no value to check.
if (R.getNumDefinedValues() == 0 &&
- !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
- &R))
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
continue;
// For multi-def recipes, currently only interleaved loads, suffice to
// check first def only.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d6bc462a0dfab..1f54fd61a12c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
+ case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPWidenLoadEVLSC:
@@ -2434,12 +2435,14 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
}
};
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
- public VPIRMetadata {
+/// A common base class for interleaved memory operations.
+/// Interleaved memory operation is a memory access method that combines
+/// multiple strided loads/stores into a single wide load/store with shuffles.
+/// The first operand must be the address. The optional operands are, in order,
+/// the stored values and the mask.
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
+ public VPIRMetadata {
+protected:
const InterleaveGroup<Instruction> *IG;
/// Indicates if the interleave group is in a conditional block and requires a
@@ -2450,12 +2453,13 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
/// unusued gaps can be loaded speculatively.
bool NeedsMaskForGaps = false;
-public:
- VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues, VPValue *Mask,
- bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
- : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD),
- IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+ VPInterleaveBase(const unsigned char SC,
+ const InterleaveGroup<Instruction> *IG,
+ ArrayRef<VPValue *> Operands,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+ : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG),
+ NeedsMaskForGaps(NeedsMaskForGaps) {
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
@@ -2473,14 +2477,21 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
addOperand(Mask);
}
}
- ~VPInterleaveRecipe() override = default;
- VPInterleaveRecipe *clone() override {
- return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
- NeedsMaskForGaps, *this, getDebugLoc());
+public:
+ VPInterleaveBase *clone() override {
+ llvm_unreachable("cloning not supported");
}
- VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+ R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
+ }
+
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
/// Return the address accessed by this recipe.
VPValue *getAddr() const {
@@ -2490,48 +2501,139 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
+ // Mask is optional and the last operand.
return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
}
+ /// Return true if the access needs a mask because of the gaps.
+ bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
+ const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+
+ Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPInterleaveBase should not be instantiated.");
+ }
+
+ /// Return the cost of this recipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const = 0;
+
+ /// Returns the number of stored operands of this interleave group. Returns 0
+ /// for load interleave groups.
+ virtual unsigned getNumStoreOperands() const = 0;
+
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
- ArrayRef<VPValue *> getStoredValues() const {
- // The first operand is the address, followed by the stored values, followed
- // by an optional mask.
- return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(1, getNumStoreOperands());
+ virtual ArrayRef<VPValue *> getStoredValues() const = 0;
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleave recipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+ : VPInterleaveBase(VPDef::VPInterleaveSC, IG, ArrayRef<VPValue *>({Addr}),
+ StoredValues, Mask, NeedsMaskForGaps, MD, DL) {}
+
+ ~VPInterleaveRecipe() override = default;
+
+ VPInterleaveRecipe *clone() override {
+ return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
+ NeedsMaskForGaps, *this, getDebugLoc());
}
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
/// Generate the wide load or store, and shuffles.
void execute(VPTransformState &State) override;
- /// Return the cost of this VPInterleaveRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ }
- /// Returns the number of stored operands of this interleave group. Returns 0
- /// for load interleave groups.
- unsigned getNumStoreOperands() const {
+ unsigned getNumStoreOperands() const override {
return getNumOperands() - (HasMask ? 2 : 1);
}
- /// The recipe only uses the first lane of the address.
+ ArrayRef<VPValue *> getStoredValues() const override {
+ // The first operand is the address, followed by the stored values, followed
+ // by an optional mask.
+ return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+ .slice(1, getNumStoreOperands());
+ }
+};
+
+/// A recipe for interleaved access operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length . Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask)
+ : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+ ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+ R.getStoredValues(), Mask, R.needsMaskForGaps(), R,
+ R.getDebugLoc()) {
+ assert(!IG->isReverse() &&
+ "Reversed interleave-group with tail folding is not supported.");
+ assert(!needsMaskForGaps() && "Interleaved access with gap mask is not "
+ "supported for scalable vector.");
+ }
+
+ ~VPInterleaveEVLRecipe() override = default;
+
+ VPInterleaveEVLRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+ /// The VPValue of the explicit vector length.
+ VPValue *getEVL() const { return getOperand(1); }
+
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The recipe only uses the first lane of the address, and EVL operand.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
+ Op == getEVL();
}
- Instruction *getInsertPos() const { return IG->getInsertPos(); }
+ unsigned getNumStoreOperands() const override {
+ return getNumOperands() - (HasMask ? 3 : 2);
+ }
+
+ ArrayRef<VPValue *> getStoredValues() const override {
+ // The first operand is the address, and the second operand is EVL, followed
+ // by the stored values, followe by an optional mask.
+ return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+ .slice(2, getNumStoreOperands());
+ }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 747c6623aa22a..ab7942024c063 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
- .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+ .Case<VPInterleaveRecipe, VPInterleaveEVLRecipe>([V](const auto *R) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0368f9b8dbb00..47a43f3392c73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
- return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return true;
@@ -107,6 +108,9 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
+ case VPInterleaveEVLSC:
+ case VPInterleaveSC:
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() == 0;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayReadFromMemory();
@@ -183,6 +187,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
@@ -255,7 +260,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
Instruction *UI = nullptr;
if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
- else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+ else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
@@ -2174,7 +2179,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
if (VF.isScalar())
return TTI::CastContextHint::Normal;
- if (isa<VPInterleaveRecipe>(R))
+ if (isa<VPInterleaveBase>(R))
return TTI::CastContextHint::Interleave;
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
@@ -3731,8 +3736,161 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
+ assert(!State.Lane && "Interleave group being replicated.");
+ assert(State.VF.isScalable() &&
+ "Only support scalable VF for EVL tail-folding.");
+ assert(!NeedsMaskForGaps &&
+ "Masking gaps for scalable vectors is not yet supported.");
+ const InterleaveGroup<Instruction> *Group = IG;
+ Instruction *Instr = Group->getInsertPos();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getLoadStoreType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ assert(InterleaveFactor <= 8 &&
+ "Unsupported deinterleave/interleave factor for scalable vectors");
+ ElementCount WideVF = State.VF * InterleaveFactor;
+ auto *VecTy = VectorType::get(ScalarTy, WideVF);
+
+ VPValue *BlockInMask = getMask();
+ VPValue *Addr = getAddr();
+ Value *ResAddr = State.get(Addr, VPLane(0));
+ Value *EVL = State.get(getEVL(), VPLane(0));
+ LLVMContext &Ctx = State.Builder.getContext();
+
+ auto CreateGroupMask = [&BlockInMask, &State,
+ &InterleaveFactor]() -> Value * {
+ auto *ResBlockInMask = State.get(BlockInMask);
+ SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+ return interleaveVectors(State.Builder, Ops, "interleaved.mask");
+ };
+
+ Value *GroupMask = nullptr;
+ if (BlockInMask)
+ GroupMask = CreateGroupMask();
+ else
+ GroupMask =
+ State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+
+ const DataLayout &DL = Instr->getDataLayout();
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ CallInst *NewLoad = State.Builder.CreateIntrinsic(VecTy, Intrinsic::vp_load,
+ {ResAddr, GroupMask, EVL},
+ nullptr, "wide.vp.load");
+ NewLoad->addParamAttr(0,
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewLoad);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewLoad);
+
+ ArrayRef<VPValue *> VPDefs = definedValues();
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ NewLoad = State.Builder.CreateIntrinsic(
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
+ /*FMFSource=*/nullptr, "strided.vec");
+
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
+
+ // If this member has different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+ StridedVec =
+ createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+ }
+
+ State.set(VPDefs[J], StridedVec);
+ ++J;
+ }
+ return;
+ }
+
+ // The sub vector type for current instruction.
+ auto *SubVT = VectorType::get(ScalarTy, State.VF);
+
+ // Vectorize the interleaved store group.
+ ArrayRef<VPValue *> StoredValues = getStoredValues();
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ unsigned StoredIdx = 0;
+ for (unsigned I = 0; I < InterleaveFactor; I++) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member) {
+ Value *Undef = PoisonValue::get(SubVT);
+ StoredVecs.push_back(Undef);
+ continue;
+ }
+
+ Value *StoredVec = State.get(StoredValues[StoredIdx]);
+ ++StoredIdx;
+
+ // If this member has different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+ CallInst *NewStore = State.Builder.CreateIntrinsic(
+ Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store,
+ {IVec, ResAddr, GroupMask, EVL});
+ NewStore->addParamAttr(1,
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewStore);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewStore);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (VPValue *Mask = getMask()) {
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i))
+ continue;
+ if (getNumStoreOperands() > 0) {
+ O << "\n" << Indent << " vp.store ";
+ getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
+ O << " to index " << i;
+ } else {
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = vp.load from index " << i;
+ }
+ ++OpIdx;
+ }
+}
+#endif
+
+InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Instruction *InsertPos = getInsertPos();
// Find the VPValue index of the interleave group. We need to skip gaps.
unsigned InsertPosIdx = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e0bf241c73fdd..c0d2adaad4acc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2205,6 +2205,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *NewAddr = GetNewAddr(S->getAddr());
return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
})
+ .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
+ VPValue *NewMask = GetNewMask(IR->getMask());
+ return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
+ })
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
@@ -2327,16 +2331,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
if (!EVLRecipe)
continue;
- [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
+ unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
"New recipe must define the same number of values as the "
"original.");
- assert(NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
+ EVLRecipe)) {
+ for (unsigned I = 0; I < NumDefVal; ++I) {
+ VPValue *CurVPV = CurRecipe->getVPValue(I);
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
+ }
}
ToErase.push_back(CurRecipe);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..85c6c2c8d7965 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -38,7 +38,7 @@ struct VPDoubleValueDef;
class VPSlotTracker;
class VPUser;
class VPRecipeBase;
-class VPInterleaveRecipe;
+class VPInterleaveBase;
class VPPhiAccessors;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -48,7 +48,7 @@ class VPPhiAccessors;
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
- friend class VPInterleaveRecipe;
+ friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
@@ -335,6 +335,7 @@ class VPDef {
VPExpressionSC,
VPIRInstructionSC,
VPInstructionSC,
+ VPInterleaveEVLSC,
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418e..d59ff8d410149 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
+ VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 29d901f084bdb..313e9e366144f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -401,18 +401,13 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP16]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ult <vscale x 4 x i32> [[TMP13]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 2
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP22]], i32 1, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP22]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP10]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index dc963f1bf264f..8537cdc1c6e4a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -13,22 +13,16 @@ define void @load_store_factor2_i32(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP13]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP13]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -107,22 +101,16 @@ define void @load_store_factor2_i32(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; SCALABLE-NEXT: [[TMP13:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP13]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP13]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -184,22 +172,16 @@ define void @load_store_factor2_i64(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr [[TMP14]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK]], <vscale x 4 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP20]], splat (i64 1)
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[TMP11]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -278,22 +260,16 @@ define void @load_store_factor2_i64(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr [[TMP14]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK]], <vscale x 4 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP20]], splat (i64 1)
; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[TMP11]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -355,14 +331,9 @@ define void @load_store_factor3_i32(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 12 x i1> @llvm.vector.interleave3.nxv12i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.masked.load.nxv12i32.p0(ptr [[TMP17]], i32 4, <vscale x 12 x i1> [[INTERLEAVED_MASK]], <vscale x 12 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -371,8 +342,7 @@ define void @load_store_factor3_i32(ptr %p) {
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 12 x i1> @llvm.vector.interleave3.nxv12i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 4, <vscale x 12 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -465,14 +435,9 @@ define void @load_store_factor3_i32(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; SCALABLE-NEXT: [[TMP14:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3
; SCALABLE-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 12 x i1> @llvm.vector.interleave3.nxv12i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.masked.load.nxv12i32.p0(ptr [[TMP17]], i32 4, <vscale x 12 x i1> [[INTERLEAVED_MASK]], <vscale x 12 x i32> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -481,8 +446,7 @@ define void @load_store_factor3_i32(ptr %p) {
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 12 x i1> @llvm.vector.interleave3.nxv12i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 4, <vscale x 12 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -555,14 +519,9 @@ define void @load_store_factor3_i64(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave3.nxv6i1(<vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.masked.load.nxv6i64.p0(ptr [[TMP14]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK]], <vscale x 6 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -571,8 +530,7 @@ define void @load_store_factor3_i64(ptr %p) {
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP25]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave3.nxv6i1(<vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -665,14 +623,9 @@ define void @load_store_factor3_i64(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3
; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave3.nxv6i1(<vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.masked.load.nxv6i64.p0(ptr [[TMP14]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK]], <vscale x 6 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -681,8 +634,7 @@ define void @load_store_factor3_i64(ptr %p) {
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP25]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave3.nxv6i1(<vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -755,14 +707,9 @@ define void @load_store_factor4(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0(ptr [[TMP9]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -773,8 +720,7 @@ define void @load_store_factor4(ptr %p) {
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP26]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
@@ -879,14 +825,9 @@ define void @load_store_factor4(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4
; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0(ptr [[TMP9]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -897,8 +838,7 @@ define void @load_store_factor4(ptr %p) {
; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP26]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
@@ -982,14 +922,9 @@ define void @load_store_factor5(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP18]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 5 x i1> @llvm.vector.interleave5.nxv5i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.masked.load.nxv5i64.p0(ptr [[TMP19]], i32 8, <vscale x 5 x i1> [[INTERLEAVED_MASK]], <vscale x 5 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1002,8 +937,7 @@ define void @load_store_factor5(ptr %p) {
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 4)
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 5 x i1> @llvm.vector.interleave5.nxv5i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 8, <vscale x 5 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
@@ -1122,14 +1056,9 @@ define void @load_store_factor5(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP18:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP18]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5
; SCALABLE-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 5 x i1> @llvm.vector.interleave5.nxv5i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.masked.load.nxv5i64.p0(ptr [[TMP19]], i32 8, <vscale x 5 x i1> [[INTERLEAVED_MASK]], <vscale x 5 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1142,8 +1071,7 @@ define void @load_store_factor5(ptr %p) {
; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 4)
; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 5 x i1> @llvm.vector.interleave5.nxv5i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 8, <vscale x 5 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
@@ -1238,14 +1166,9 @@ define void @load_store_factor6(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP20]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave6.nxv6i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.masked.load.nxv6i64.p0(ptr [[TMP21]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK]], <vscale x 6 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1260,8 +1183,7 @@ define void @load_store_factor6(ptr %p) {
; CHECK-NEXT: [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave6.nxv6i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]]
@@ -1393,14 +1315,9 @@ define void @load_store_factor6(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP20:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP20]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6
; SCALABLE-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave6.nxv6i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.masked.load.nxv6i64.p0(ptr [[TMP21]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK]], <vscale x 6 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1415,8 +1332,7 @@ define void @load_store_factor6(ptr %p) {
; SCALABLE-NEXT: [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; SCALABLE-NEXT: [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave6.nxv6i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]]
@@ -1522,14 +1438,9 @@ define void @load_store_factor7(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP22]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 7 x i1> @llvm.vector.interleave7.nxv7i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.masked.load.nxv7i64.p0(ptr [[TMP23]], i32 8, <vscale x 7 x i1> [[INTERLEAVED_MASK]], <vscale x 7 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1546,8 +1457,7 @@ define void @load_store_factor7(ptr %p) {
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 7 x i1> @llvm.vector.interleave7.nxv7i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP23]], i32 8, <vscale x 7 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]]
@@ -1693,14 +1603,9 @@ define void @load_store_factor7(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP22:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP22]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7
; SCALABLE-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 7 x i1> @llvm.vector.interleave7.nxv7i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.masked.load.nxv7i64.p0(ptr [[TMP23]], i32 8, <vscale x 7 x i1> [[INTERLEAVED_MASK]], <vscale x 7 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1717,8 +1622,7 @@ define void @load_store_factor7(ptr %p) {
; SCALABLE-NEXT: [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; SCALABLE-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 7 x i1> @llvm.vector.interleave7.nxv7i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP23]], i32 8, <vscale x 7 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]]
@@ -1835,14 +1739,9 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave8.nxv8i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0(ptr [[TMP24]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1861,8 +1760,7 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 8)
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]])
-; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave8.nxv8i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP24]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK1]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]]
@@ -2019,14 +1917,9 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 1 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3
; SCALABLE-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave8.nxv8i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0(ptr [[TMP24]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -2045,8 +1938,7 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
; SCALABLE-NEXT: [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 8)
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]])
-; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave8.nxv8i1(<vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]], <vscale x 1 x i1> [[TMP5]])
-; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP24]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]]
@@ -2174,14 +2066,9 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> [[TMP12]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -2262,14 +2149,9 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; SCALABLE-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> [[TMP12]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -2337,14 +2219,9 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]]
-; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> [[TMP12]])
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr [[TMP15]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK]], <vscale x 4 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -2425,14 +2302,9 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; SCALABLE-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]]
-; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> [[TMP12]])
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr [[TMP15]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK]], <vscale x 4 x i64> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 48e8a1dac348b..f581442112eea 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -113,32 +113,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult <vscale x 16 x i32> [[TMP2]], [[BROADCAST_SPLAT4]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 1
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP8]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP11]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP14]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 32 x i8> @llvm.vp.load.nxv32i8.p0(ptr align 1 [[TMP5]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_VP_LOAD]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP9]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP8]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
-; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_DATA-WITH-EVL: middle.block:
; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_DATA-WITH-EVL: scalar.ph:
@@ -294,36 +291,33 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult <vscale x 16 x i32> [[TMP2]], [[BROADCAST_SPLAT4]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 2
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP8]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP17]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr align 1 [[TMP5]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_VP_LOAD]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP10]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
-; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; PREDICATED_DATA-WITH-EVL: middle.block:
; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_DATA-WITH-EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 332c16e8eb656..fedb8fc7cf51c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -7,7 +7,6 @@
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
-; FIXME: interleaved accesses are not supported yet with predicated vectorization.
define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-LABEL: @interleave(
; IF-EVL-NEXT: entry:
@@ -17,25 +16,20 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ult <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
-; IF-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[TMP17]])
-; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP6]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP16]])
; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; IF-EVL-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; IF-EVL-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP15]], [[TMP14]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP16]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; IF-EVL: scalar.ph:
>From 537976c15d36def9caa37414167b56408595d7c5 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 11 Aug 2025 01:31:16 -0700
Subject: [PATCH 02/13] use base class to get type
---
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index ab7942024c063..d400ceff7797c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
- .Case<VPInterleaveRecipe, VPInterleaveEVLRecipe>([V](const auto *R) {
+ .Case<VPInterleaveBase>([V](const auto *R) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
>From c622a71e0021acfd295d42b7b4af92171e4c54bd Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 11 Aug 2025 03:35:11 -0700
Subject: [PATCH 03/13] Unify getStoredValues()
---
llvm/lib/Transforms/Vectorize/VPlan.h | 20 +++++---------------
1 file changed, 5 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f54fd61a12c3..9424279f73793 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2529,7 +2529,11 @@ class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
- virtual ArrayRef<VPValue *> getStoredValues() const = 0;
+ ArrayRef<VPValue *> getStoredValues() const {
+ return ArrayRef<VPValue *>(op_end() -
+ (getNumStoreOperands() + (HasMask ? 1 : 0)),
+ getNumStoreOperands());
+ }
};
/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
@@ -2571,13 +2575,6 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
unsigned getNumStoreOperands() const override {
return getNumOperands() - (HasMask ? 2 : 1);
}
-
- ArrayRef<VPValue *> getStoredValues() const override {
- // The first operand is the address, followed by the stored values, followed
- // by an optional mask.
- return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(1, getNumStoreOperands());
- }
};
/// A recipe for interleaved access operations with vector-predication
@@ -2627,13 +2624,6 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
unsigned getNumStoreOperands() const override {
return getNumOperands() - (HasMask ? 3 : 2);
}
-
- ArrayRef<VPValue *> getStoredValues() const override {
- // The first operand is the address, and the second operand is EVL, followed
- // by the stored values, followe by an optional mask.
- return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(2, getNumStoreOperands());
- }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
>From b0f24336856dae5874aea018fa563f9ae0bf4cb1 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 13 Aug 2025 00:10:59 -0700
Subject: [PATCH 04/13] Remove the mayReadFromMemory setting
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 47a43f3392c73..0a41509b19bb2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -108,9 +108,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
- case VPInterleaveEVLSC:
- case VPInterleaveSC:
- return cast<VPInterleaveBase>(this)->getNumStoreOperands() == 0;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayReadFromMemory();
@@ -146,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return false;
}
default:
+ // FIXME: Return false if the recipe represents an interleaved store.
return true;
}
}
>From 4dba7c227673db8935b08c2c74c9f51433186624 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 15 Aug 2025 00:15:58 -0700
Subject: [PATCH 05/13] Fix the interleave EVL
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 ++-
.../LoopVectorize/RISCV/dead-ops-cost.ll | 3 +-
.../RISCV/interleaved-accesses.ll | 120 ++++++++++++------
.../RISCV/interleaved-masked-access.ll | 16 ++-
.../RISCV/tail-folding-interleave.ll | 3 +-
5 files changed, 103 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0a41509b19bb2..26c59a400bc23 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3755,6 +3755,9 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
Value *EVL = State.get(getEVL(), VPLane(0));
+ Value *InterleaveEVL = State.Builder.CreateMul(
+ EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
+ /* NUW= */ true, /* NSW= */ true);
LLVMContext &Ctx = State.Builder.getContext();
auto CreateGroupMask = [&BlockInMask, &State,
@@ -3774,9 +3777,9 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
const DataLayout &DL = Instr->getDataLayout();
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
- CallInst *NewLoad = State.Builder.CreateIntrinsic(VecTy, Intrinsic::vp_load,
- {ResAddr, GroupMask, EVL},
- nullptr, "wide.vp.load");
+ CallInst *NewLoad = State.Builder.CreateIntrinsic(
+ VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
+ "wide.vp.load");
NewLoad->addParamAttr(0,
Attribute::getWithAlignment(Ctx, Group->getAlign()));
@@ -3844,9 +3847,9 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
// Interleave all the smaller vectors into one wider vector.
Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
- CallInst *NewStore = State.Builder.CreateIntrinsic(
- Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store,
- {IVec, ResAddr, GroupMask, EVL});
+ CallInst *NewStore =
+ State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
+ {IVec, ResAddr, GroupMask, InterleaveEVL});
NewStore->addParamAttr(1,
Attribute::getWithAlignment(Ctx, Group->getAlign()));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 313e9e366144f..f79998b2b81c4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -407,7 +407,8 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 2
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP22]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP10]], 2
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP22]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 8537cdc1c6e4a..f4d74bc129503 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -15,14 +15,16 @@ define void @load_store_factor2_i32(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 2
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -103,14 +105,16 @@ define void @load_store_factor2_i32(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; SCALABLE-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 2
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -174,14 +178,16 @@ define void @load_store_factor2_i64(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP20]], splat (i64 1)
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 2
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -262,14 +268,16 @@ define void @load_store_factor2_i64(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; SCALABLE-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP20]], splat (i64 1)
; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 2
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[TMP11]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
@@ -333,7 +341,8 @@ define void @load_store_factor3_i32(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 3
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -341,8 +350,9 @@ define void @load_store_factor3_i32(ptr %p) {
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 3
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -437,7 +447,8 @@ define void @load_store_factor3_i32(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3
; SCALABLE-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 3
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -445,8 +456,9 @@ define void @load_store_factor3_i32(ptr %p) {
; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 3
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv12i32.p0(<vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], <vscale x 12 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -521,7 +533,8 @@ define void @load_store_factor3_i64(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 3
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -529,8 +542,9 @@ define void @load_store_factor3_i64(ptr %p) {
; CHECK-NEXT: [[TMP25:%.*]] = add <vscale x 2 x i64> [[TMP23]], splat (i64 1)
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 3
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP25]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -625,7 +639,8 @@ define void @load_store_factor3_i64(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3
; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 3
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -633,8 +648,9 @@ define void @load_store_factor3_i64(ptr %p) {
; SCALABLE-NEXT: [[TMP25:%.*]] = add <vscale x 2 x i64> [[TMP23]], splat (i64 1)
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 3
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP25]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
@@ -709,7 +725,8 @@ define void @load_store_factor4(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 4
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -719,8 +736,9 @@ define void @load_store_factor4(ptr %p) {
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP26]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
@@ -827,7 +845,8 @@ define void @load_store_factor4(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4
; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 4
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -837,8 +856,9 @@ define void @load_store_factor4(ptr %p) {
; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 4
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP26]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
@@ -924,7 +944,8 @@ define void @load_store_factor5(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 5
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -936,8 +957,9 @@ define void @load_store_factor5(ptr %p) {
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 3)
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 4)
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 5
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
@@ -1058,7 +1080,8 @@ define void @load_store_factor5(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5
; SCALABLE-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 5
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1070,8 +1093,9 @@ define void @load_store_factor5(ptr %p) {
; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 3)
; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 4)
; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 5
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv5i64.p0(<vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], <vscale x 5 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
@@ -1168,7 +1192,8 @@ define void @load_store_factor6(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 6
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1182,8 +1207,9 @@ define void @load_store_factor6(ptr %p) {
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 6
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]]
@@ -1317,7 +1343,8 @@ define void @load_store_factor6(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6
; SCALABLE-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 6
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1331,8 +1358,9 @@ define void @load_store_factor6(ptr %p) {
; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 4)
; SCALABLE-NEXT: [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; SCALABLE-NEXT: [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 6
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], <vscale x 6 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]]
@@ -1440,7 +1468,8 @@ define void @load_store_factor7(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 7
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1456,8 +1485,9 @@ define void @load_store_factor7(ptr %p) {
; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 7
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]]
@@ -1605,7 +1635,8 @@ define void @load_store_factor7(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7
; SCALABLE-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 7
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1621,8 +1652,9 @@ define void @load_store_factor7(ptr %p) {
; SCALABLE-NEXT: [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 5)
; SCALABLE-NEXT: [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; SCALABLE-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 7
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv7i64.p0(<vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], <vscale x 7 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]]
@@ -1741,7 +1773,8 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 8
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1759,8 +1792,9 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; CHECK-NEXT: [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 8)
+; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 8
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]]
@@ -1919,7 +1953,8 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3
; SCALABLE-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 8
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
@@ -1937,8 +1972,9 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 6)
; SCALABLE-NEXT: [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 7)
; SCALABLE-NEXT: [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 8)
+; SCALABLE-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP7]], 8
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]], <vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]])
-; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
; SCALABLE-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]]
@@ -2068,7 +2104,8 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -2151,7 +2188,8 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
@@ -2221,7 +2259,8 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]]
-; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
@@ -2304,7 +2343,8 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1
; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]]
-; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP7]], 2
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]])
; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index f581442112eea..4333422794abf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -119,8 +119,9 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 1
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVE_EVL:%.*]] = shl nuw nsw i32 [[TMP1]], 1
; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 32 x i8> @llvm.vp.load.nxv32i8.p0(ptr align 1 [[TMP5]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 32 x i8> @llvm.vp.load.nxv32i8.p0(ptr align 1 [[TMP5]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], i32 [[INTERLEAVE_EVL]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_VP_LOAD]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
@@ -128,9 +129,10 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = sext i32 [[TMP3]] to i64
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP9]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP8]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVE_EVL3:%.*]] = shl nuw nsw i32 [[TMP1]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK4:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP11]])
-; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], <vscale x 32 x i1> [[INTERLEAVED_MASK4]], i32 [[INTERLEAVE_EVL3]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
@@ -297,8 +299,9 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 2
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVE_EVL:%.*]] = shl nuw nsw i32 [[TMP1]], 2
; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr align 1 [[TMP5]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr align 1 [[TMP5]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], i32 [[INTERLEAVE_EVL]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_VP_LOAD]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
@@ -310,9 +313,10 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sext i32 [[TMP3]] to i64
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVE_EVL3:%.*]] = shl nuw nsw i32 [[TMP1]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK4:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
-; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], <vscale x 64 x i1> [[INTERLEAVED_MASK4]], i32 [[INTERLEAVE_EVL3]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index fedb8fc7cf51c..4a8d79b685fe1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -18,7 +18,8 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
-; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP16]])
+; IF-EVL-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP16]], 2
+; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; IF-EVL-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; IF-EVL-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
>From 3ad90f5af45035a9de09e685a54c2da5eab0eec7 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 18 Aug 2025 03:06:48 -0700
Subject: [PATCH 06/13] flatten CreateGroupMask
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 26c59a400bc23..198192f24833e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3751,7 +3751,6 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
ElementCount WideVF = State.VF * InterleaveFactor;
auto *VecTy = VectorType::get(ScalarTy, WideVF);
- VPValue *BlockInMask = getMask();
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
Value *EVL = State.get(getEVL(), VPLane(0));
@@ -3760,19 +3759,14 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
/* NUW= */ true, /* NSW= */ true);
LLVMContext &Ctx = State.Builder.getContext();
- auto CreateGroupMask = [&BlockInMask, &State,
- &InterleaveFactor]() -> Value * {
- auto *ResBlockInMask = State.get(BlockInMask);
- SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
- return interleaveVectors(State.Builder, Ops, "interleaved.mask");
- };
-
Value *GroupMask = nullptr;
- if (BlockInMask)
- GroupMask = CreateGroupMask();
- else
+ if (VPValue *BlockInMask = getMask()) {
+ SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
+ GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
+ } else {
GroupMask =
State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+ }
const DataLayout &DL = Instr->getDataLayout();
// Vectorize the interleaved load group.
>From 7e52ff508c631a7ebffe7b64aa4c86ee58c3213d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 18 Aug 2025 03:15:06 -0700
Subject: [PATCH 07/13] singleton arrayref constructor
---
llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9424279f73793..a696488e2098b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2545,8 +2545,8 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
ArrayRef<VPValue *> StoredValues, VPValue *Mask,
bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
- : VPInterleaveBase(VPDef::VPInterleaveSC, IG, ArrayRef<VPValue *>({Addr}),
- StoredValues, Mask, NeedsMaskForGaps, MD, DL) {}
+ : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask,
+ NeedsMaskForGaps, MD, DL) {}
~VPInterleaveRecipe() override = default;
>From cc20825c5fb32a668ec058e3f1e1a42fd89d75c1 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 18 Aug 2025 06:08:32 -0700
Subject: [PATCH 08/13] mark VPInterleaveBase::onlyFirstLaneUsed as override
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a696488e2098b..64980a44763c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2521,7 +2521,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
VPCostContext &Ctx) const override;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- virtual bool onlyFirstLaneUsed(const VPValue *Op) const = 0;
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
/// Returns the number of stored operands of this interleave group. Returns 0
/// for load interleave groups.
>From 6df82f3be785215d19307f12f526cc33b2490766 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 25 Aug 2025 00:45:52 -0700
Subject: [PATCH 09/13] nfc, private member as possible.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 19 ++++++++++---------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 ++++++++------
2 files changed, 18 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 64980a44763c2..c42833d3722f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2442,17 +2442,17 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
/// the stored values and the mask.
class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
public VPIRMetadata {
-protected:
const InterleaveGroup<Instruction> *IG;
- /// Indicates if the interleave group is in a conditional block and requires a
- /// mask.
- bool HasMask = false;
-
/// Indicates if gaps between members of the group need to be masked out or if
/// unusued gaps can be loaded speculatively.
bool NeedsMaskForGaps = false;
+protected:
+ /// Indicates if the interleave group is in a conditional block and requires a
+ /// mask.
+ bool HasMask = false;
+
VPInterleaveBase(const unsigned char SC,
const InterleaveGroup<Instruction> *IG,
ArrayRef<VPValue *> Operands,
@@ -2508,7 +2508,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
/// Return true if the access needs a mask because of the gaps.
bool needsMaskForGaps() const { return NeedsMaskForGaps; }
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+ const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; }
Instruction *getInsertPos() const { return IG->getInsertPos(); }
@@ -2551,8 +2551,9 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
~VPInterleaveRecipe() override = default;
VPInterleaveRecipe *clone() override {
- return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
- NeedsMaskForGaps, *this, getDebugLoc());
+ return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(),
+ getStoredValues(), getMask(),
+ needsMaskForGaps(), *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
@@ -2587,7 +2588,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
ArrayRef<VPValue *>({R.getAddr(), &EVL}),
R.getStoredValues(), Mask, R.needsMaskForGaps(), R,
R.getDebugLoc()) {
- assert(!IG->isReverse() &&
+ assert(!getInterleaveGroup()->isReverse() &&
"Reversed interleave-group with tail folding is not supported.");
assert(!needsMaskForGaps() && "Interleaved access with gap mask is not "
"supported for scalable vector.");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 198192f24833e..24268b28d183c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3536,9 +3536,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
- assert((!NeedsMaskForGaps || !State.VF.isScalable()) &&
+ assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
"Masking gaps for scalable vectors is not yet supported.");
- const InterleaveGroup<Instruction> *Group = IG;
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
@@ -3578,7 +3578,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
Value *MaskForGaps = nullptr;
- if (NeedsMaskForGaps) {
+ if (needsMaskForGaps()) {
MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
@@ -3655,7 +3655,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
- assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+ assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
"Mismatch between NeedsMaskForGaps and MaskForGaps");
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
@@ -3706,6 +3706,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
@@ -3738,9 +3739,9 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
assert(State.VF.isScalable() &&
"Only support scalable VF for EVL tail-folding.");
- assert(!NeedsMaskForGaps &&
+ assert(!needsMaskForGaps() &&
"Masking gaps for scalable vectors is not yet supported.");
- const InterleaveGroup<Instruction> *Group = IG;
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
@@ -3855,6 +3856,7 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
>From 7ee07792ad6c24b0e0f75498c54b6d2678f39b21 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 25 Aug 2025 00:54:54 -0700
Subject: [PATCH 10/13] nfc, set VPInterleaveBase::clone() as pure
---
llvm/lib/Transforms/Vectorize/VPlan.h | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c42833d3722f6..b41aeffd21489 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2479,9 +2479,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
}
public:
- VPInterleaveBase *clone() override {
- llvm_unreachable("cloning not supported");
- }
+ VPInterleaveBase *clone() override = 0;
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
>From 9160fbbe14804b8f9df0ce0793ae3705d4710a23 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 25 Aug 2025 01:04:06 -0700
Subject: [PATCH 11/13] nfc, move getDataLayout
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 24268b28d183c..5896e04b7c100 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3769,7 +3769,6 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
}
- const DataLayout &DL = Instr->getDataLayout();
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
CallInst *NewLoad = State.Builder.CreateIntrinsic(
@@ -3790,6 +3789,7 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
+ const DataLayout &DL = Instr->getDataLayout();
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
@@ -3820,6 +3820,7 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
unsigned StoredIdx = 0;
+ const DataLayout &DL = Instr->getDataLayout();
for (unsigned I = 0; I < InterleaveFactor; I++) {
Instruction *Member = Group->getMember(I);
>From fa84b66515a3981a6c3a9cc9d718a8c1a6966717 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 25 Aug 2025 01:05:37 -0700
Subject: [PATCH 12/13] nfc, refine comment
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b41aeffd21489..7e579aa84c039 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2578,7 +2578,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
/// A recipe for interleaved access operations with vector-predication
/// intrinsics. The first operand is the address, the second operand is the
-/// explicit vector length . Stored values and mask are optional operands.
+/// explicit vector length. Stored values and mask are optional operands.
class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
public:
VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask)
>From cd9e90764ef71cf1acf25eed5aa7d5ca2054aa5e Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 25 Aug 2025 01:09:17 -0700
Subject: [PATCH 13/13] nfc, replace definedValues() with getVPValue(J)
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5896e04b7c100..ebb0fecb9189a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3781,7 +3781,6 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
// TODO: Also manage existing metadata using VPIRMetadata.
Group->addMetadata(NewLoad);
- ArrayRef<VPValue *> VPDefs = definedValues();
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
NewLoad = State.Builder.CreateIntrinsic(
@@ -3806,7 +3805,7 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
}
- State.set(VPDefs[J], StridedVec);
+ State.set(getVPValue(J), StridedVec);
++J;
}
return;
More information about the llvm-commits
mailing list