[llvm] [LV][EVL] Support interleaved access with tail folding by EVL (PR #152070)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 5 01:13:03 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/152070
>From cd02d05a8d4b6558304af206eb4108655c9b5c4a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 21 Jul 2025 08:06:02 -0700
Subject: [PATCH] Support EVL interleave access
---
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 178 ++++++++++++++----
.../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 162 +++++++++++++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 19 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 5 +-
.../Transforms/Vectorize/VPlanVerifier.cpp | 3 +-
.../RISCV/interleaved-masked-access.ll | 90 ++++-----
.../RISCV/tail-folding-interleave.ll | 22 +--
9 files changed, 364 insertions(+), 121 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 13058a159cf95..02a0d1b7a1cde 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4228,6 +4228,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntOrFpInductionSC:
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
@@ -4256,8 +4257,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
// If no def nor is a store, e.g., branches, continue - no value to check.
if (R.getNumDefinedValues() == 0 &&
- !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
- &R))
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
continue;
// For multi-def recipes, currently only interleaved loads, suffice to
// check first def only.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c13fe4548ff11..382f375009d36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
+ case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPWidenLoadEVLSC:
@@ -2371,11 +2372,14 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
}
};
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
+/// A common base class for interleaved memory operations.
+/// Interleaved memory operation is a memory access method that combines
+/// multiple strided loads/stores into a single wide load/store with shuffles.
+/// The first operand must be the address. The optional operands are, in order,
+/// the stored values and the mask.
+/// TODO: Inherit from VPIRMetadata
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase {
+protected:
const InterleaveGroup<Instruction> *IG;
/// Indicates if the interleave group is in a conditional block and requires a
@@ -2386,90 +2390,186 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
/// unusued gaps can be loaded speculatively.
bool NeedsMaskForGaps = false;
-public:
- VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues, VPValue *Mask,
- bool NeedsMaskForGaps, DebugLoc DL)
- : VPRecipeBase(VPDef::VPInterleaveSC, {Addr},
- DL),
-
- IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+ VPInterleaveBase(const unsigned char SC,
+ const InterleaveGroup<Instruction> *IG,
+ ArrayRef<VPValue *> Operands,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, DebugLoc DL)
+ : VPRecipeBase(SC, Operands, DL), IG(IG),
+ NeedsMaskForGaps(NeedsMaskForGaps) {
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i)) {
- if (I->getType()->isVoidTy())
+ for (unsigned I = 0; I < IG->getFactor(); ++I)
+ if (Instruction *Inst = IG->getMember(I)) {
+ if (Inst->getType()->isVoidTy())
continue;
- new VPValue(I, this);
+ new VPValue(Inst, this);
}
for (auto *SV : StoredValues)
addOperand(SV);
+
if (Mask) {
HasMask = true;
addOperand(Mask);
}
}
- ~VPInterleaveRecipe() override = default;
- VPInterleaveRecipe *clone() override {
- return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
- NeedsMaskForGaps, getDebugLoc());
+public:
+ VPInterleaveBase *clone() override {
+ llvm_unreachable("cloning not supported");
}
- VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+ R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
+ }
+
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
/// Return the address accessed by this recipe.
VPValue *getAddr() const {
return getOperand(0); // Address is the 1st, mandatory operand.
}
+ /// Return true if the access needs a mask because of the gaps.
+ bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
+ // Mask is optional and the last operand.
return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
}
+ const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+
+ Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPInterleaveBase should not be instantiated.");
+ }
+
+ /// Return the cost of this VPInterleaveRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const = 0;
+
+ /// Returns the number of stored operands of this interleave group. Returns 0
+ /// for load interleave groups.
+ virtual unsigned getNumStoreOperands() const = 0;
+
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
- ArrayRef<VPValue *> getStoredValues() const {
- // The first operand is the address, followed by the stored values, followed
- // by an optional mask.
- return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(1, getNumStoreOperands());
+ virtual ArrayRef<VPValue *> getStoredValues() const = 0;
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleave recipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, DebugLoc DL)
+ : VPInterleaveBase(VPDef::VPInterleaveSC, IG, ArrayRef<VPValue *>({Addr}),
+ StoredValues, Mask, NeedsMaskForGaps, DL) {}
+
+ ~VPInterleaveRecipe() override = default;
+
+ VPInterleaveRecipe *clone() override {
+ return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
+ NeedsMaskForGaps, getDebugLoc());
}
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
/// Generate the wide load or store, and shuffles.
void execute(VPTransformState &State) override;
- /// Return the cost of this VPInterleaveRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ }
- /// Returns the number of stored operands of this interleave group. Returns 0
- /// for load interleave groups.
- unsigned getNumStoreOperands() const {
+ unsigned getNumStoreOperands() const override {
return getNumOperands() - (HasMask ? 2 : 1);
}
- /// The recipe only uses the first lane of the address.
+ ArrayRef<VPValue *> getStoredValues() const override {
+ // The first operand is the address, followed by the stored values, followed
+ // by an optional mask.
+ return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+ .slice(1, getNumStoreOperands());
+ }
+};
+
+/// A recipe for interleaved access operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length . Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask,
+ DebugLoc DL = {})
+ : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+ ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+ R.getStoredValues(), Mask, R.needsMaskForGaps(), DL) {
+ assert(!IG->isReverse() &&
+ "Reversed interleave-group with tail folding is not supported.");
+ }
+
+ ~VPInterleaveEVLRecipe() override = default;
+
+ VPInterleaveEVLRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+ /// The VPValue of the explicit vector length.
+ VPValue *getEVL() const { return getOperand(1); }
+
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The recipe only uses the first lane of the address, and EVL operand.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op) ||
+ Op == getEVL();
}
- Instruction *getInsertPos() const { return IG->getInsertPos(); }
+ unsigned getNumStoreOperands() const override {
+ return getNumOperands() - (HasMask ? 3 : 2);
+ }
+
+ ArrayRef<VPValue *> getStoredValues() const override {
+ // The first operand is the address, and the second operand is EVL, followed
+ // by the stored values, followe by an optional mask.
+ return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+ .slice(2, getNumStoreOperands());
+ }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 16072f268a98c..db541bc6e53a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -295,7 +295,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
- .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+ .Case<VPInterleaveRecipe, VPInterleaveEVLRecipe>([V](const auto *R) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 98d11f0bc7893..74e894759bab8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -53,8 +53,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
- return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return true;
@@ -108,6 +109,9 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
+ case VPInterleaveEVLSC:
+ case VPInterleaveSC:
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() == 0;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayReadFromMemory();
@@ -184,6 +188,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
@@ -256,7 +261,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
Instruction *UI = nullptr;
if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
- else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+ else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
@@ -2091,7 +2096,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
if (VF.isScalar())
return TTI::CastContextHint::Normal;
- if (isa<VPInterleaveRecipe>(R))
+ if (isa<VPInterleaveBase>(R))
return TTI::CastContextHint::Interleave;
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
@@ -3627,8 +3632,155 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
+ assert(!State.Lane && "Interleave group being replicated.");
+ assert(State.VF.isScalable() &&
+ "Only support scalable VF for EVL tail-folding.");
+ assert(!NeedsMaskForGaps &&
+ "Masking gaps for scalable vectors is not yet supported.");
+ const InterleaveGroup<Instruction> *Group = IG;
+ Instruction *Instr = Group->getInsertPos();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getLoadStoreType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ assert(InterleaveFactor <= 8 &&
+ "Unsupported deinterleave/interleave factor for scalable vectors");
+ ElementCount WideVF = State.VF * InterleaveFactor;
+ auto *VecTy = VectorType::get(ScalarTy, WideVF);
+
+ VPValue *BlockInMask = getMask();
+ VPValue *Addr = getAddr();
+ Value *ResAddr = State.get(Addr, VPLane(0));
+ Value *EVL = State.get(getEVL(), VPLane(0));
+
+ auto CreateGroupMask = [&BlockInMask, &State,
+ &InterleaveFactor]() -> Value * {
+ auto *ResBlockInMask = State.get(BlockInMask);
+ SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+ return interleaveVectors(State.Builder, Ops, "interleaved.mask");
+ };
+
+ Value *GroupMask = nullptr;
+ if (BlockInMask)
+ GroupMask = CreateGroupMask();
+ else
+ GroupMask =
+ State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+
+ const DataLayout &DL = Instr->getDataLayout();
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ CallInst *NewLoad = State.Builder.CreateIntrinsic(VecTy, Intrinsic::vp_load,
+ {ResAddr, GroupMask, EVL},
+ nullptr, "wide.vp.load");
+ NewLoad->addParamAttr(0, Attribute::getWithAlignment(NewLoad->getContext(),
+ Group->getAlign()));
+
+ Group->addMetadata(NewLoad);
+
+ ArrayRef<VPValue *> VPDefs = definedValues();
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ NewLoad = State.Builder.CreateIntrinsic(
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
+ /*FMFSource=*/nullptr, "strided.vec");
+
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
+
+ // If this member has different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+ StridedVec =
+ createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+ }
+
+ State.set(VPDefs[J], StridedVec);
+ ++J;
+ }
+ return;
+ }
+
+ // The sub vector type for current instruction.
+ auto *SubVT = VectorType::get(ScalarTy, State.VF);
+
+ // Vectorize the interleaved store group.
+ ArrayRef<VPValue *> StoredValues = getStoredValues();
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ unsigned StoredIdx = 0;
+ for (unsigned I = 0; I < InterleaveFactor; I++) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member) {
+ Value *Undef = PoisonValue::get(SubVT);
+ StoredVecs.push_back(Undef);
+ continue;
+ }
+
+ Value *StoredVec = State.get(StoredValues[StoredIdx]);
+ ++StoredIdx;
+
+ // If this member has different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+ CallInst *NewStore = State.Builder.CreateIntrinsic(
+ Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store,
+ {IVec, ResAddr, GroupMask, EVL});
+ NewStore->addParamAttr(1, Attribute::getWithAlignment(NewStore->getContext(),
+ Group->getAlign()));
+
+ Group->addMetadata(NewStore);
+}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (VPValue *Mask = getMask()) {
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i))
+ continue;
+ if (getNumStoreOperands() > 0) {
+ O << "\n" << Indent << " vp.store ";
+ getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
+ O << " to index " << i;
+ } else {
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = vp.load from index " << i;
+ }
+ ++OpIdx;
+ }
+}
+#endif
+
+InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Instruction *InsertPos = getInsertPos();
// Find the VPValue index of the interleave group. We need to skip gaps.
unsigned InsertPosIdx = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 98c6b9a70405b..a011a72367dd2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2142,6 +2142,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *NewMask = GetNewMask(S->getMask());
return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
})
+ .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
+ VPValue *NewMask = GetNewMask(IR->getMask());
+ return new VPInterleaveEVLRecipe(*IR, EVL, NewMask, IR->getDebugLoc());
+ })
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
@@ -2233,17 +2237,18 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
if (!EVLRecipe)
continue;
- [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
+ unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
"New recipe must define the same number of values as the "
"original.");
- assert(
- NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
+
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
+ EVLRecipe)) {
+ for (unsigned I = 0; I < NumDefVal; ++I) {
+ VPValue *CurVPV = CurRecipe->getVPValue(I);
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
+ }
}
ToErase.push_back(CurRecipe);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..85c6c2c8d7965 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -38,7 +38,7 @@ struct VPDoubleValueDef;
class VPSlotTracker;
class VPUser;
class VPRecipeBase;
-class VPInterleaveRecipe;
+class VPInterleaveBase;
class VPPhiAccessors;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -48,7 +48,7 @@ class VPPhiAccessors;
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
- friend class VPInterleaveRecipe;
+ friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
@@ -335,6 +335,7 @@ class VPDef {
VPExpressionSC,
VPIRInstructionSC,
VPInstructionSC,
+ VPInterleaveEVLSC,
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 14ae4f2204310..0ca8f7d2f2807 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
+ VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 976ce77d2ba29..144972bc692cd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -117,34 +117,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult <vscale x 16 x i32> [[TMP2]], [[BROADCAST_SPLAT4]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 1
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP8]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP11]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP14]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 32 x i8> @llvm.vp.load.nxv32i8.p0(ptr align 1 [[TMP5]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_VP_LOAD]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP9]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP8]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
-; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_DATA-WITH-EVL: middle.block:
; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_DATA-WITH-EVL: scalar.ph:
@@ -304,38 +299,33 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult <vscale x 16 x i32> [[TMP2]], [[BROADCAST_SPLAT4]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 2
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP8]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
-; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = sext i32 [[TMP6]] to i64
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP17]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr align 1 [[TMP5]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_VP_LOAD]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP10]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
-; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; PREDICATED_DATA-WITH-EVL: middle.block:
; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_DATA-WITH-EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 8d987a94d383d..9b88df004f45c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -7,7 +7,6 @@
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
-; FIXME: interleaved accesses are not supported yet with predicated vectorization.
define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-LABEL: @interleave(
; IF-EVL-NEXT: entry:
@@ -25,25 +24,20 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ult <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
-; IF-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[TMP17]])
-; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP6]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP16]])
; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; IF-EVL-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; IF-EVL-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP15]], [[TMP14]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP16]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; IF-EVL: scalar.ph:
More information about the llvm-commits
mailing list