[llvm] [VPlan] Extract reverse mask from reverse accesses (PR #155579)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 01:18:28 PDT 2026
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/155579
>From 5bda9dfce4c22f7d75db66a2fb1342d1078e1aa3 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 14 Aug 2025 00:41:44 -0700
Subject: [PATCH 1/9] Also extract reverse mask
---
.../Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++------
llvm/lib/Transforms/Vectorize/VPlan.h | 39 +++-------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 73 ++++++++-----------
.../Transforms/Vectorize/VPlanTransforms.cpp | 58 ++++++++-------
.../AArch64/sve-vector-reverse-mask4.ll | 3 +-
.../AArch64/vector-reverse-mask4.ll | 2 +-
.../RISCV/dbg-tail-folding-by-evl.ll | 6 +-
.../RISCV/predicated-reverse-store.ll | 4 +-
.../RISCV/riscv-vector-reverse.ll | 16 ++--
.../RISCV/tail-folding-reverse-load-store.ll | 14 ++--
.../RISCV/tail-folding-uniform-store.ll | 2 +-
.../VPlan/RISCV/vplan-riscv-vector-reverse.ll | 2 +-
.../LoopVectorize/X86/masked_load_store.ll | 12 +--
.../Transforms/Vectorize/VPlanTest.cpp | 14 ++--
14 files changed, 132 insertions(+), 159 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac94d0dbcc4cd..565c8ab7b0484 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7137,10 +7137,19 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
return nullptr;
};
- // Check if a select for a safe divisor was hoisted to the pre-header. If so,
- // the select doesn't need to be considered for the vector loop cost; go with
- // the more accurate VPlan-based cost model.
for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
+ using namespace VPlanPatternMatch;
+ // Reverse operations for reverse memory accesses may be hoisted to the
+ // preheader by LICM if the reversed value is loop invariant. In this case,
+ // the VPlan-based cost model diverges from the legacy cost model.
+ if (match(&R,
+ m_CombineOr(m_Reverse(m_VPValue()),
+ m_Intrinsic<Intrinsic::experimental_vp_reverse>())))
+ return true;
+
+ // Check if a select for a safe divisor was hoisted to the pre-header. If
+ // so, the select doesn't need to be considered for the vector loop cost; go
+ // with the more accurate VPlan-based cost model.
auto *VPI = dyn_cast<VPInstruction>(&R);
if (!VPI || VPI->getOpcode() != Instruction::Select)
continue;
@@ -7171,6 +7180,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
continue;
}
+ using namespace VPlanPatternMatch;
// Unused FOR splices are removed by VPlan transforms, so the VPlan-based
// cost model won't cost it whilst the legacy will.
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
@@ -7193,20 +7203,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
return true;
-
- if (WidenMemR->isReverse()) {
- // If the stored value of a reverse store is invariant, LICM will
- // hoist the reverse operation to the preheader. In this case, the
- // result of the VPlan-based cost model will diverge from that of
- // the legacy model.
- if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
- if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
- return true;
-
- if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
- if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
- return true;
- }
}
// The legacy cost model costs non-header phis with a scalar VF as a phi,
@@ -7229,6 +7225,11 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}
+
+ // vp.reverse is generated during EVL tail folding transformation.
+ if (match(&R, m_Intrinsic<Intrinsic::experimental_vp_reverse>()))
+ return true;
+
if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
@@ -7754,10 +7755,13 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Ptr = VectorPtr;
}
+ if (Reverse && Mask)
+ Mask = Builder.createNaryOp(VPInstruction::Reverse, Mask, I->getDebugLoc());
+
if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
- auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- *VPI, Load->getDebugLoc());
+ auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, *VPI,
+ Load->getDebugLoc());
if (Reverse) {
Builder.insert(LoadR);
return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
@@ -7771,8 +7775,8 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
if (Reverse)
StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
Store->getDebugLoc());
- return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
- Reverse, *VPI, Store->getDebugLoc());
+ return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive, *VPI,
+ Store->getDebugLoc());
}
VPWidenIntOrFpInductionRecipe *
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ab47d927942db..5bac173262468 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3535,9 +3535,6 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Whether the accessed addresses are consecutive.
bool Consecutive;
- /// Whether the consecutive accessed addresses are in reverse order.
- bool Reverse;
-
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -3551,15 +3548,10 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse,
- const VPIRMetadata &Metadata, DebugLoc DL)
+ bool Consecutive, const VPIRMetadata &Metadata,
+ DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
- Reverse(Reverse) {
- assert((Consecutive || !Reverse) && "Reverse implies consecutive");
- assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
- "Reversed acccess without VPVectorEndPointerRecipe address?");
- }
+ Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive) {}
public:
VPWidenMemoryRecipe *clone() override {
@@ -3581,10 +3573,6 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Return whether the loaded-from / stored-to addresses are consecutive.
bool isConsecutive() const { return Consecutive; }
- /// Return whether the consecutive loaded/stored addresses are in reverse
- /// order.
- bool isReverse() const { return Reverse; }
-
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -3618,18 +3606,16 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPRecipeValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse,
- const VPIRMetadata &Metadata, DebugLoc DL)
+ bool Consecutive, const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenLoadSC, Load, {Addr},
- Consecutive, Reverse, Metadata, DL),
+ Consecutive, Metadata, DL),
VPRecipeValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, *this,
- getDebugLoc());
+ getMask(), Consecutive, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPRecipeBase::VPWidenLoadSC);
@@ -3662,7 +3648,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe,
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+ {Addr, &EVL}, L.isConsecutive(), L,
L.getDebugLoc()),
VPRecipeValue(this, &getIngredient()) {
setMask(Mask);
@@ -3701,18 +3687,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe,
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive, bool Reverse,
+ VPValue *Mask, bool Consecutive,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenStoreSC, Store,
- {Addr, StoredVal}, Consecutive, Reverse, Metadata,
- DL) {
+ {Addr, StoredVal}, Consecutive, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, *this, getDebugLoc());
+ *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPRecipeBase::VPWidenStoreSC);
@@ -3747,8 +3732,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenStoreEVLSC, S.getIngredient(),
- {Addr, StoredVal, &EVL}, S.isConsecutive(),
- S.isReverse(), S, S.getDebugLoc()) {
+ {Addr, StoredVal, &EVL}, S.isConsecutive(), S,
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7eefd77045050..0944c29763769 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1029,8 +1029,6 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
return TTI::CastContextHint::Normal;
if (!WidenMemoryRecipe->isConsecutive())
return TTI::CastContextHint::GatherScatter;
- if (WidenMemoryRecipe->isReverse())
- return TTI::CastContextHint::Reversed;
if (WidenMemoryRecipe->isMasked())
return TTI::CastContextHint::Masked;
return TTI::CastContextHint::Normal;
@@ -1038,6 +1036,7 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
+ bool IsReverse = false;
// For Trunc/FPTrunc, get the context from the only user.
if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
@@ -1046,8 +1045,10 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
return dyn_cast<VPRecipeBase>(*R->user_begin());
};
if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
- if (match(Recipe, m_Reverse(m_VPValue())))
+ if (match(Recipe, m_Reverse(m_VPValue()))) {
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
+ IsReverse = true;
+ }
if (Recipe)
CCH = ComputeCCH(Recipe);
}
@@ -1057,12 +1058,16 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
Opcode == Instruction::FPExt) {
if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
- if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
+ if (match(Recipe, m_Reverse(m_VPValue(ReverseOp)))) {
Recipe = ReverseOp->getDefiningRecipe();
+ IsReverse = true;
+ }
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}
+ if (IsReverse && CCH != TTI::CastContextHint::None)
+ CCH = TTI::CastContextHint::Reversed;
auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
@@ -1244,8 +1249,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
- auto *VectorTy = cast<VectorType>(
- toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
+ Type *EltTy = Ctx.Types.inferScalarType(getOperand(0));
+ // Skip the reverse operation cost for the mask.
+ // FIXME: Remove this once redundant mask reverse operations can be
+ // eliminated by VPlanTransforms::cse before cost computation.
+ if (EltTy->isIntegerTy(1))
+ return 0;
+ auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, /*Mask=*/{}, Ctx.CostKind,
/*Index=*/0);
@@ -1933,6 +1943,13 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
const VPRecipeWithIRFlags &R,
ElementCount VF,
VPCostContext &Ctx) {
+ Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+ // Skip the reverse operation cost for the mask.
+ // FIXME: Remove this once redundant mask reverse operations can be eliminated
+ // by VPlanTransforms::cse before cost computation.
+ if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
+ return InstructionCost(0);
+
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
@@ -1952,7 +1969,6 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
Arguments.push_back(V);
}
- Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
for (const VPValue *Op : Operands) {
@@ -3801,9 +3817,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
- assert(!Reverse &&
- "Inconsecutive memory access should not have the order.");
-
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
Type *PtrTy = Ptr->getType();
@@ -3847,13 +3860,8 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
Value *Mask = nullptr;
- if (auto *VPMask = getMask()) {
- // Mask reversal is only needed for non-all-one (null) masks, as reverse
- // of a null all-one mask is a null mask.
+ if (auto *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = Builder.CreateVectorReverse(Mask, "reverse");
- }
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
Value *NewLI;
@@ -3881,17 +3889,6 @@ void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
- Value *EVL, const Twine &Name) {
- VectorType *ValTy = cast<VectorType>(Operand->getType());
- Value *AllTrueMask =
- Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
- return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
- {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
@@ -3902,13 +3899,10 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask()) {
+ if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
- } else {
+ else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- }
if (CreateGather) {
NewLI =
@@ -3960,13 +3954,8 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
Value *Mask = nullptr;
- if (auto *VPMask = getMask()) {
- // Mask reversal is only needed for non-all-one (null) masks, as reverse
- // of a null all-one mask is a null mask.
+ if (auto *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = Builder.CreateVectorReverse(Mask, "reverse");
- }
Value *StoredVal = State.get(StoredVPValue);
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
@@ -3998,13 +3987,11 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask()) {
+ if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
- } else {
+ else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- }
+
Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f779aa92a3aa7..9b73a14c56a6a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -80,12 +80,11 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, *VPI,
- Ingredient.getDebugLoc());
+ false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
- nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
+ nullptr /*Mask*/, false /*Consecutive*/, *VPI,
Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
@@ -1785,8 +1784,6 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
!WidenStoreR->isConsecutive()) {
- assert(!WidenStoreR->isReverse() &&
- "Not consecutive memory recipes shouldn't be reversed");
VPValue *Mask = WidenStoreR->getMask();
// Only convert the scatter to a scalar store if it is unmasked.
@@ -3059,18 +3056,28 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return EVLEndPtr;
};
+ auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
+ DL](VPValue *V) -> VPWidenIntrinsicRecipe * {
+ auto *Reverse = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
+ TypeInfo.inferScalarType(V), {}, {}, DL);
+ Reverse->insertBefore(&CurRecipe);
+ return Reverse;
+ };
+
if (match(&CurRecipe,
- m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
- !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
+ m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
EVL, Mask);
VPValue *ReversedVal;
if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
match(ReversedVal,
- m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
- match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
- cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
+ m_MaskedLoad(m_VPValue(EndPtr),
+ m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
+ match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
+ if (Mask)
+ Mask = GetVPReverse(Mask);
auto *LoadR = new VPWidenLoadEVLRecipe(
*cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
LoadR->insertBefore(&CurRecipe);
@@ -3081,24 +3088,19 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *StoredVal;
if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
- m_RemoveMask(HeaderMask, Mask))) &&
- !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
+ m_RemoveMask(HeaderMask, Mask))))
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
StoredVal, EVL, Mask);
if (match(&CurRecipe,
m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
- m_RemoveMask(HeaderMask, Mask))) &&
- match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
- cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
- auto *NewReverse = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse,
- {ReversedVal, Plan->getTrue(), &EVL},
- TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
- NewReverse->insertBefore(&CurRecipe);
+ m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
+ match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
+ if (Mask)
+ Mask = GetVPReverse(Mask);
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
- AdjustEndPtr(EndPtr), NewReverse, EVL,
- Mask);
+ AdjustEndPtr(EndPtr),
+ GetVPReverse(ReversedVal), EVL, Mask);
}
if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
@@ -5367,9 +5369,9 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
// Narrow interleave group to wide load, as transformed VPlan will only
// process one original iteration.
auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
- auto *L = new VPWidenLoadRecipe(
- *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
+ LoadGroup->getMask(), /*Consecutive=*/true,
+ {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
NarrowedOps.insert(L);
return L;
@@ -5524,9 +5526,9 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
auto *SI =
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
- auto *S = new VPWidenStoreRecipe(
- *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+ auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
+ /*Consecutive=*/true, {},
+ StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index 7f3b28422e47b..7c9854fe30b17 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -23,8 +23,7 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
-; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
-; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE9]]
+; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE6]])
entry:
%cmp7 = icmp sgt i64 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 4a4928c637a5c..39ae02fad4187 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -46,8 +46,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]], <4 x double> poison)
; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]], <4 x double> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP9]], <4 x i1> [[REVERSE5]], <4 x double> poison)
; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], splat (double 1.000000e+00)
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll
index 1c06b4b77e4a1..1e7cbfca4b032 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll
@@ -29,11 +29,11 @@ define void @reverse_store(ptr %a, i64 %n) !dbg !3 {
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 -1), !dbg [[DBG6:![0-9]+]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i64> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[TMP8]], !dbg [[DBG7:![0-9]+]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8:![0-9]+]]
; CHECK-NEXT: [[TMP11:%.*]] = sub nuw nsw i64 [[TMP5]], 1
; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP12]], !dbg [[DBG8]]
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP12]], !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr align 8 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll
index 116a87795fa0f..9c96f44a9d19b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll
@@ -17,12 +17,12 @@ define void @reverse_predicated_store(i1 %c, ptr %dst, i64 %n) #0 {
; CHECK-NEXT: [[IV:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], -1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = sub nuw nsw i64 [[TMP4]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 0, [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[ARRAYIDX]], i64 [[TMP7]]
-; CHECK-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP12]], ptr align 4 [[TMP9]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP1]])
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP4]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 094500f07b418..6fb1bf0d191e9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -37,8 +37,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP5]]
+; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
@@ -69,8 +69,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
-; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP4]]
+; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]]
@@ -196,8 +196,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP29:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
-; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i64 [[TMP18]]
+; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]]
@@ -250,8 +250,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
-; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP23]], i32 [[TMP10]]
+; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP25]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]]
@@ -421,8 +421,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP29:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]]
-; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[TMP30]], i64 [[TMP18]]
+; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE3]], ptr align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]]
@@ -475,8 +475,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP22:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
-; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP23]], i32 [[TMP10]]
+; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE3]], ptr align 4 [[TMP25]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]]
@@ -622,8 +622,8 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP5]]
+; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
@@ -654,8 +654,8 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP4]]
+; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
index f1542e403a870..2675b0cf11a29 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
@@ -27,8 +27,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
@@ -127,18 +127,17 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP15:%.*]] = sub nuw nsw i64 [[TMP26]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 0, [[TMP15]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP13]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP26]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]]
; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
@@ -180,8 +179,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP14]]
; NO-VP-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE2]])
-; NO-VP-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE4]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[REVERSE3]])
+; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE4]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[REVERSE]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -265,8 +263,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP11]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
index 871dac6b9a78b..b2e79f5033ee8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
@@ -23,11 +23,11 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = sub nuw nsw i64 [[TMP10]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 0, [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP7]]
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[REVERSE]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll
index dc0570110f606..6fd57e3dd2da2 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll
@@ -37,8 +37,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: WIDEN-INTRINSIC vp<[[VAL_B:%.+]]> = call llvm.experimental.vp.reverse(ir<[[LOAD_B]]>, ir<true>, vp<[[EVL]]>)
; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add vp<[[VAL_B]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_A:%.+]]> = getelementptr inbounds ir<[[A:%.+]]>, ir<[[IDX_PROM]]>
-; CHECK-NEXT: WIDEN-INTRINSIC vp<[[STORE_VAL:%.+]]> = call llvm.experimental.vp.reverse(ir<[[ADD_RESULT]]>, ir<true>, vp<[[EVL]]>)
; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_A]]>, vp<[[EVL]]>
+; CHECK-NEXT: WIDEN-INTRINSIC vp<[[STORE_VAL:%.+]]> = call llvm.experimental.vp.reverse(ir<[[ADD_RESULT]]>, ir<true>, vp<[[EVL]]>)
; CHECK-NEXT: WIDEN vp.store vp<[[VEC_END_PTR_A]]>, vp<[[STORE_VAL]]>, vp<[[EVL]]>
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[EVL]]>, vp<[[EVL_PHI]]>
; CHECK-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[EVL]]>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index e2b713e868fa7..672257bfda3f8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1139,12 +1139,12 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i64 -11
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i64 -15
; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP15]], <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]]
; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP16]], <4 x i1> [[REVERSE13]], <4 x double> poison), !alias.scope [[META25]]
; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP17]], <4 x i1> [[REVERSE15]], <4 x double> poison), !alias.scope [[META25]]
; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP15]], <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]]
+; AVX2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP16]], <4 x i1> [[REVERSE13]], <4 x double> poison), !alias.scope [[META25]]
+; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP17]], <4 x i1> [[REVERSE15]], <4 x double> poison), !alias.scope [[META25]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP18]], <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META25]]
; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD14]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -1218,12 +1218,12 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i64 -23
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i64 -31
; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP15]], <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META37:![0-9]+]]
; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP16]], <8 x i1> [[REVERSE13]], <8 x double> poison), !alias.scope [[META37]]
; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP17]], <8 x i1> [[REVERSE15]], <8 x double> poison), !alias.scope [[META37]]
; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP15]], <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META37:![0-9]+]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP16]], <8 x i1> [[REVERSE13]], <8 x double> poison), !alias.scope [[META37]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP17]], <8 x i1> [[REVERSE15]], <8 x double> poison), !alias.scope [[META37]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP18]], <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META37]]
; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD14]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index c28631a5a3ddb..2bb3b5dc5b3e3 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1200,7 +1200,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPRecipeBase) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, {}, {});
checkVPRecipeCastImpl<VPWidenLoadRecipe, VPUser, VPIRMetadata>(&Recipe);
@@ -1232,7 +1232,7 @@ TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) {
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8));
- VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, false, {}, {});
+ VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, {}, {});
VPWidenLoadEVLRecipe Recipe(BaseLoad, Addr, *EVL, Mask);
checkVPRecipeCastImpl<VPWidenLoadEVLRecipe, VPUser, VPIRMetadata>(&Recipe);
@@ -1249,7 +1249,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreRecipeToVPUser) {
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *StoredVal = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 42));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, false, {}, {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, {}, {});
checkVPRecipeCastImpl<VPWidenStoreRecipe, VPUser, VPIRMetadata>(&Recipe);
@@ -1266,8 +1266,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreEVLRecipeToVPUser) {
VPValue *StoredVal = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 42));
VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenStoreRecipe BaseStore(*Store, Addr, StoredVal, Mask, true, false, {},
- {});
+ VPWidenStoreRecipe BaseStore(*Store, Addr, StoredVal, Mask, true, {}, {});
VPWidenStoreEVLRecipe Recipe(BaseStore, Addr, StoredVal, *EVL, Mask);
checkVPRecipeCastImpl<VPWidenStoreEVLRecipe, VPUser, VPIRMetadata>(&Recipe);
@@ -1353,7 +1352,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, {}, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1367,8 +1366,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {},
- {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, {}, {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
>From 6c55b7ef1b5ad22795ce0fc955e29bf77913850a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 20 Mar 2026 02:15:23 -0700
Subject: [PATCH 2/9] Remove experimental_vp_reverse rule from
planContainsAdditionalSimplifications
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 565c8ab7b0484..540023926ca93 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7226,10 +7226,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
return true;
}
- // vp.reverse is generated during EVL tail folding transformation.
- if (match(&R, m_Intrinsic<Intrinsic::experimental_vp_reverse>()))
- return true;
-
if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
>From 46261dc0f640ba612008745e80448a4fe7dcc8f3 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Mar 2026 00:51:52 -0700
Subject: [PATCH 3/9] remove blank
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 540023926ca93..60ab60f4a9222 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7225,7 +7225,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}
-
if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
>From 12ee93480d82db5da97ad6b36844c32ef9b467d6 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 26 Mar 2026 00:15:36 -0700
Subject: [PATCH 4/9] Fix Unspecified Evaluation Order
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9b73a14c56a6a..7efb2bf3de328 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3075,11 +3075,12 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
match(ReversedVal,
m_MaskedLoad(m_VPValue(EndPtr),
m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
- match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
+ match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
if (Mask)
Mask = GetVPReverse(Mask);
+ Addr = AdjustEndPtr(EndPtr);
auto *LoadR = new VPWidenLoadEVLRecipe(
- *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
+ *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
LoadR->insertBefore(&CurRecipe);
return new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
@@ -3095,12 +3096,13 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
if (match(&CurRecipe,
m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
- match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF())))) {
+ match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
if (Mask)
Mask = GetVPReverse(Mask);
- return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
- AdjustEndPtr(EndPtr),
- GetVPReverse(ReversedVal), EVL, Mask);
+ Addr = AdjustEndPtr(EndPtr);
+ StoredVal = GetVPReverse(ReversedVal);
+ return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
+ StoredVal, EVL, Mask);
}
if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
>From d4aa8d00d2292bdbe17a2d996a76d9fb32912758 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 26 Mar 2026 23:36:07 -0700
Subject: [PATCH 5/9] remove redundant using namespace VPlanPatternMatch;
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 60ab60f4a9222..b268c81e550cf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7138,7 +7138,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
};
for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
- using namespace VPlanPatternMatch;
// Reverse operations for reverse memory accesses may be hoisted to the
// preheader by LICM if the reversed value is loop invariant. In this case,
// the VPlan-based cost model diverges from the legacy cost model.
@@ -7180,7 +7179,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
continue;
}
- using namespace VPlanPatternMatch;
// Unused FOR splices are removed by VPlan transforms, so the VPlan-based
// cost model won't cost it whilst the legacy will.
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
>From 5c1e5dc193bc306a7ab9cc5c87d65ae86234c8c7 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 26 Mar 2026 23:40:36 -0700
Subject: [PATCH 6/9] use this for inferScalarType
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0944c29763769..17859745b9cb8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1249,7 +1249,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
- Type *EltTy = Ctx.Types.inferScalarType(getOperand(0));
+ Type *EltTy = Ctx.Types.inferScalarType(this);
// Skip the reverse operation cost for the mask.
// FIXME: Remove this once redundant mask reverse operations can be
// eliminated by VPlanTransforms::cse before cost computation.
>From 2af537df9ddf55d91ff6e7dcef0d07d2f0457a04 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 26 Mar 2026 23:53:11 -0700
Subject: [PATCH 7/9] inline all-true mask check into GetVPReverse
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7efb2bf3de328..827784202d945 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3058,6 +3058,8 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
DL](VPValue *V) -> VPWidenIntrinsicRecipe * {
+ if (!V)
+ return nullptr;
auto *Reverse = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
TypeInfo.inferScalarType(V), {}, {}, DL);
@@ -3076,8 +3078,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
m_MaskedLoad(m_VPValue(EndPtr),
m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
- if (Mask)
- Mask = GetVPReverse(Mask);
+ Mask = GetVPReverse(Mask);
Addr = AdjustEndPtr(EndPtr);
auto *LoadR = new VPWidenLoadEVLRecipe(
*cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
@@ -3097,8 +3098,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
- if (Mask)
- Mask = GetVPReverse(Mask);
+ Mask = GetVPReverse(Mask);
Addr = AdjustEndPtr(EndPtr);
StoredVal = GetVPReverse(ReversedVal);
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
>From 71791942605911f176abcd8135acd56d05ded0d2 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 27 Mar 2026 10:23:49 -0700
Subject: [PATCH 8/9] Restore the reverse assertion
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 17859745b9cb8..654992ba50da2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3817,6 +3817,29 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
+#ifndef NDEBUG
+ auto IsReverse = [this]() {
+ // Check if mask is reversed
+ if (VPValue *Mask = getMask())
+ if (match(Mask, m_Reverse(m_VPValue())))
+ return true;
+
+ // For loads, check if the single user is a reverse operation
+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)) {
+ auto *U = getVPSingleValue()->getSingleUser();
+ return U && match(cast<VPRecipeBase>(U), m_Reverse(m_VPValue()));
+ }
+
+ // For stores, check if the stored value is reversed
+ VPValue *StoredVal =
+ isa<VPWidenStoreRecipe>(this)
+ ? cast<VPWidenStoreRecipe>(this)->getStoredValue()
+ : cast<VPWidenStoreEVLRecipe>(this)->getStoredValue();
+ return match(StoredVal, m_Reverse(m_VPValue()));
+ };
+ assert(!IsReverse() &&
+ "Inconsecutive memory access should not have reverse order");
+#endif
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
Type *PtrTy = Ptr->getType();
>From c4b9848f66df8d2b280129e4549c701a17afd5d6 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 31 Mar 2026 01:17:26 -0700
Subject: [PATCH 9/9] update IsReverse
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 654992ba50da2..7a53ebd375ca2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3817,20 +3817,19 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
-#ifndef NDEBUG
- auto IsReverse = [this]() {
- // Check if mask is reversed
+ [[maybe_unused]] auto IsReverse = [this]() {
+ // Check if mask is reversed.
if (VPValue *Mask = getMask())
if (match(Mask, m_Reverse(m_VPValue())))
return true;
- // For loads, check if the single user is a reverse operation
+ // For loads, check if the single user is a reverse operation.
if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)) {
auto *U = getVPSingleValue()->getSingleUser();
return U && match(cast<VPRecipeBase>(U), m_Reverse(m_VPValue()));
}
- // For stores, check if the stored value is reversed
+ // For stores, check if the stored value is reversed.
VPValue *StoredVal =
isa<VPWidenStoreRecipe>(this)
? cast<VPWidenStoreRecipe>(this)->getStoredValue()
@@ -3839,7 +3838,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
};
assert(!IsReverse() &&
"Inconsecutive memory access should not have reverse order");
-#endif
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
Type *PtrTy = Ptr->getType();
More information about the llvm-commits
mailing list