[llvm] [LV] Support strided load with a stride of -1 (PR #128718)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 01:56:16 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/128718
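For context, the pattern this series targets is the reverse-iteration (stride -1) loop exercised by the RISC-V tests further down. A minimal C++ illustration of that source pattern (function name, arrays A/B and the 1023 trip count are taken from the vector_reverse_i32 test):

  // Reverse traversal: each iteration touches the element one below the
  // previous one. Today this vectorizes as a wide load plus a vector.reverse
  // shuffle; with this series it can instead become a single
  // vp.strided.load / vp.strided.store with a byte stride of -4.
  void vector_reverse_i32(int *__restrict A, int *__restrict B) {
    for (long i = 1023; i > 0; --i)
      A[i - 1] = B[i - 1] + 1;
  }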
>From c542dc80553fd3eb5f83c558b9f1053358e98c2e Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:37:53 -0800
Subject: [PATCH 01/15] Step 1: Patch legal check for strided accesses
---
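Note (illustration only, not part of the patch): the new helper simply widens the accessed type to the candidate VF and asks the target whether a strided access of that type is supported. A rough standalone sketch of the same query, assuming an LLVMContext `Ctx` and a TargetTransformInfo `TTI` are in scope, for an i32 load at VF = vscale x 4 with 4-byte alignment:

  // Hypothetical query mirroring isLegalStridedLoadStore below.
  Type *EltTy = Type::getInt32Ty(Ctx);
  auto *VecTy = VectorType::get(EltTy, ElementCount::getScalable(4));
  bool CanUseStrided = TTI.isLegalStridedLoadStore(VecTy, Align(4));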
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 55cc801e91452..b16ea2e72c92c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1282,6 +1282,18 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
+ /// Returns true if the target machine can represent \p V as a strided load
+ /// or store operation.
+ bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+ if (!isa<LoadInst, StoreInst>(V))
+ return false;
+ auto *Ty = getLoadStoreType(V);
+ Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
+ return TTI.isLegalStridedLoadStore(Ty, Align);
+ }
+
/// Returns true if the target machine supports all of the reduction
/// variables found for the given VF.
bool canVectorizeReductions(ElementCount VF) const {
>From caef450c0cc8dc5682950ceb7527fdb5bc863fdf Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:38:46 -0800
Subject: [PATCH 02/15] Step 2: Patch cost for strided accesses
---
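Note (illustration only): the cost comes from the existing getStridedMemoryOpCost TTI hook, so the cost model can compare it directly against the gather/scatter or reverse-widen cost it already computes. A sketch for an unmasked i32 load, assuming `VecTy`, `Ptr`, `CostKind` and the load instruction `LoadI` are in scope as in the patch:

  // Rough sketch of the query added below.
  InstructionCost StridedCost = TTI.getStridedMemoryOpCost(
      Instruction::Load, VecTy, Ptr, /*VariableMask=*/false, Align(4),
      CostKind, &LoadI);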
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b16ea2e72c92c..a06720a32384d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1627,6 +1627,9 @@ class LoopVectorizationCostModel {
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
+ /// The cost computation for strided load/store instruction.
+ InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
@@ -5820,6 +5823,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) {
+ Type *ValTy = getLoadStoreType(I);
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
+}
+
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
>From bfe2813f38c49bd7a29be1cd375e3dc93797a90f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 20 Feb 2025 06:09:52 -0800
Subject: [PATCH 03/15] Step 3: Patch the implementation of load/store recipes
---
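Note (worked example, not part of the patch): the stride operand emitted by the recipes below is the negated allocation size of the element type, so an i32 access with a 64-bit index type yields an i64 -4 stride. Assuming `Ctx` and a DataLayout `DL` are in scope:

  // -1 * DL.getTypeAllocSize(ScalarDataTy) for i32 on a 64-bit index type:
  //   TypeAllocSize(i32) = 4  =>  stride operand = i64 -4
  Type *I32Ty = Type::getInt32Ty(Ctx);
  Type *StrideTy = DL.getIndexType(PointerType::get(Ctx, /*AddrSpace=*/0));
  Constant *Stride =
      ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(I32Ty)); // i64 -4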
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 29 +++--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 113 ++++++++++++++----
.../Transforms/Vectorize/VPlanTransforms.cpp | 4 +-
.../Transforms/Vectorize/VPlanTest.cpp | 7 +-
5 files changed, 118 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a06720a32384d..19378278484b6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8381,12 +8381,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, false,
I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, I->getDebugLoc());
+ Reverse, false, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 37e0a176ab1cc..98c3c64226645 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2562,6 +2562,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
+ bool Strided;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -2575,9 +2578,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
- Reverse(Reverse) {
+ Reverse(Reverse), Strided(Strided) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
@@ -2605,6 +2608,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the accessed addresses are evenly spaced apart by a fixed
+ /// stride.
+ bool isStrided() const { return Strided; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -2634,16 +2641,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// optional mask.
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, DL),
+ Reverse, Strided, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse,
+ getMask(), Consecutive, Reverse, Strided,
getDebugLoc());
}
@@ -2675,7 +2682,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L.getDebugLoc()),
+ L.isReverse(), L.isStrided(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2712,16 +2719,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
+ VPValue *Mask, bool Consecutive, bool Reverse,
+ bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, DL) {
+ Consecutive, Reverse, Strided, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getDebugLoc());
+ Reverse, Strided, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2755,7 +2763,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
+ S.isConsecutive(), S.isReverse(), S.isStrided(),
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f2d3b1588229a..ab78375a17f68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2693,10 +2693,15 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
- return Ctx.TTI.getAddressComputationCost(Ty) +
- Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
+ if (Strided)
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
+ else
+ return Ctx.TTI.getAddressComputationCost(Ty) +
+ Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
}
InstructionCost Cost = 0;
@@ -2723,11 +2728,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2742,9 +2749,25 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
+ cast<CallInst>(NewLI)->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ } else {
+ NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
+ }
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -2782,7 +2805,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
@@ -2802,6 +2825,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
+ } else if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2856,13 +2889,15 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredVPValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2881,12 +2916,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
- if (CreateScatter)
+ if (CreateScatter) {
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
- else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
- else
+ } else if (Mask) {
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewSI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
+ cast<CallInst>(NewSI)->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+ } else {
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ }
+ } else {
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+ }
State.addMetadata(NewSI, SI);
}
@@ -2902,7 +2957,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -2927,11 +2982,25 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9a041c83438dc..ba81fe48b0c05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -76,13 +76,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/,
+ false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Ingredient.getDebugLoc());
+ false /*Strided*/, Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index c1897e2c5d277..eb4fb9bf483e9 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1084,7 +1084,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1201,7 +1201,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1215,7 +1215,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false,
+ {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
>From aa61d0f02794d1d8f3161c48287ca9f6905924a9 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 21 Feb 2025 02:38:30 -0800
Subject: [PATCH 04/15] Step 4: Connect the recipes and cost by instWidening
---
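Note (hypothetical numbers, just to illustrate the selection below): CM_Strided only replaces CM_Widen_Reverse when the strided access is legal for the target and strictly cheaper than the reverse-widen alternative.

  // E.g. reverse widen = wide load + vector.reverse, strided = single vlse.
  InstructionCost ReverseCost = 6;  // hypothetical
  InstructionCost StridedCost = 2;  // hypothetical; invalid when illegal
  if (StridedCost.isValid() && StridedCost < ReverseCost) {
    // setWideningDecision(&I, VF, CM_Strided, StridedCost);
  }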
.../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 19378278484b6..6d6fdf6753e4e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1115,6 +1115,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Strided,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -6155,6 +6156,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
"Expected consecutive stride.");
InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ // Consider using strided load/store for consecutive reverse accesses to
+ // achieve more efficient memory operations.
+ if (ConsecutiveStride == -1) {
+ const InstructionCost StridedLoadStoreCost =
+ isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
+ : InstructionCost::getInvalid();
+ if (StridedLoadStoreCost < Cost) {
+ Decision = CM_Strided;
+ Cost = StridedLoadStoreCost;
+ }
+ }
setWideningDecision(&I, VF, Decision, Cost);
continue;
}
@@ -6801,6 +6813,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI::CastContextHint::Normal;
switch (getWideningDecision(I, VF)) {
+ // TODO: New CastContextHint for strided accesses.
+ case LoopVectorizationCostModel::CM_Strided:
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
case LoopVectorizationCostModel::CM_Interleave:
@@ -8355,6 +8369,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -8381,12 +8396,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, false,
- I->getDebugLoc());
+ return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ Strided, I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, false, I->getDebugLoc());
+ Reverse, Strided, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
>From 3edd1227fe124d17018d98e713821a5daf07eb0d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 23:26:03 -0800
Subject: [PATCH 05/15] Opt: Remove dependency on VPWidenIntOrFpInductionRecipe
and extend VPVectorPointerRecipe
---
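Note (worked example): with the Strided flag, the per-part vector pointer now steps backwards, so with UF=2 the part-1 pointer starts VF elements below the part-0 pointer, matching the stride -4 accesses emitted by the load/store recipes. The step computation added in VPVectorPointerRecipe::execute (Builder, IndexTy, State, CurrentPart in scope as in the patch):

  // CurrentPart = 1, Strided = true  =>  Step = -1, i.e. part 1 begins
  // VF elements (VF * 4 bytes for i32) before part 0.
  int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
  Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Step);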
.../Transforms/Vectorize/LoopVectorize.cpp | 20 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 11 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 4 +-
.../RISCV/riscv-vector-reverse-output.ll | 274 ++++++++----------
...-force-tail-with-evl-reverse-load-store.ll | 71 ++---
...orize-force-tail-with-evl-uniform-store.ll | 27 +-
7 files changed, 185 insertions(+), 226 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d6fdf6753e4e..cfc37eff3c396 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3563,9 +3563,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (IsUniformMemOpUse(I))
return true;
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
+ return (
+ WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -8366,17 +8366,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
// reverse consecutive.
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
+
+ auto SameWiden = [&](ElementCount VF) -> bool {
+ return Decision == CM.getWideningDecision(I, VF);
+ };
+ bool ContainsWidenVF =
+ LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+ assert(ContainsWidenVF &&
+ "At least widen the memory accesses by the Start VF.");
+
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
- if (Consecutive) {
+ if (Consecutive || Strided) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
if (Reverse) {
+ assert(!Strided && "Reverse and Strided are mutually exclusive.");
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
@@ -8387,7 +8397,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VectorPtr = new VPVectorEndPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+ VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 98c3c64226645..18089b6c6c571 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1599,12 +1599,15 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;
+ /// Indicate whether to compute the pointer for strided memory accesses.
+ bool Strided;
+
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
- DebugLoc DL)
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+ GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
GEPFlags, DL),
- IndexedTy(IndexedTy) {}
+ IndexedTy(IndexedTy), Strided(Strided) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -1625,7 +1628,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
}
VPVectorPointerRecipe *clone() override {
- return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+ return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
getGEPNoWrapFlags(), getDebugLoc());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ab78375a17f68..66ff6eec21e5f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2257,7 +2257,9 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
- Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+ // TODO: Support non-unit-reverse strided accesses.
+ int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Step);
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ba81fe48b0c05..d3cfaa0abf0cf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2600,7 +2600,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *L = new VPWidenLoadRecipe(
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, LoadGroup->getDebugLoc());
+ /*Reverse=*/false, /*Strided=*/false, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
return L;
}
@@ -2631,7 +2631,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *S = new VPWidenStoreRecipe(
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, /*Strided=*/false, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index f01aaa04606d9..64f7626183817 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -39,23 +39,19 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]]
-; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]]
-; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP19]], align 4
+; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; RV64-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; RV64-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; RV64-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; RV64-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP13]], ptr align 4 [[TMP15]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -66,8 +62,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1
+; RV64-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1
; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -96,25 +92,19 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]]
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]]
-; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
-; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP21]], align 4
+; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; RV32-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP10]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; RV32-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; RV32-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; RV32-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[TMP13]], ptr align 4 [[TMP15]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -125,8 +115,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1
+; RV32-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1
; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -147,57 +137,52 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; RV64-UF2: [[VECTOR_BODY]]:
; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; RV64-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4294967292
+; RV64-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP11]]
+; RV64-UF2-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; RV64-UF2-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; RV64-UF2-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD1]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; RV64-UF2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4294967292
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP17]], ptr align 4 [[TMP20]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP25]])
+; RV64-UF2-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP23]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
; RV64-UF2: [[FOR_BODY]]:
; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1
+; RV64-UF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP29]], 1
; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -245,23 +230,19 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]]
-; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]]
-; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
-; RV64-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP19]], align 4
+; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; RV64-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; RV64-NEXT: [[TMP13:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; RV64-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; RV64-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP13]], ptr align 4 [[TMP15]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -272,8 +253,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
+; RV64-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00
; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -302,25 +283,19 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]]
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]]
-; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
-; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
-; RV32-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP21]], align 4
+; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; RV32-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP10]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; RV32-NEXT: [[TMP13:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; RV32-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; RV32-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> [[TMP13]], ptr align 4 [[TMP15]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV32-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -331,8 +306,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
+; RV32-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00
; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -353,57 +328,52 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; RV64-UF2: [[VECTOR_BODY]]:
; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-UF2-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; RV64-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4294967292
+; RV64-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP11]]
+; RV64-UF2-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; RV64-UF2-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP12]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
+; RV64-UF2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4294967292
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP22]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP17]], ptr align 4 [[TMP20]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP25]])
+; RV64-UF2-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP23]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
; RV64-UF2: [[FOR_BODY]]:
; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
+; RV64-UF2-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP29]], 1.000000e+00
; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 695af0d241159..11beb84802296 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -33,21 +33,11 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -110,6 +100,7 @@ loopend:
}
define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noalias %ptr1, ptr noalias %ptr2) {
+;
; IF-EVL-LABEL: @reverse_load_store_masked(
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -122,49 +113,37 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[N_VEC]] to i32
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]]
+; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP8]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> [[TMP11]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP15]], i64 -4, <vscale x 4 x i1> [[TMP11]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index c7c8ee4326a8b..41a60115433a2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -36,30 +36,25 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP14]], i64 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = sub nuw nsw i64 1, [[IV1]]
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP22]]
-; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX14]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = sub nuw nsw i64 1, [[IV]]
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]]
+; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX13]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
>From ac735641140c5e0daf5fe64a19a2d88bf5902331 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Mar 2025 15:43:20 +0800
Subject: [PATCH 06/15] Update llvm/lib/Transforms/Vectorize/VPlan.h
Initialize Strided to false by default.
Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 18089b6c6c571..0d3ff54ab04b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2566,7 +2566,7 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
bool Reverse;
/// Whether the accessed addresses are evenly spaced apart by a fixed stride.
- bool Strided;
+ bool Strided = false;
/// Whether the memory access is masked.
bool IsMasked = false;
>From 79419c82b805949791bd1e2c4224be59c9665678 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Mar 2025 15:45:02 +0800
Subject: [PATCH 07/15] Update llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Remove unnecessary else
Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 66ff6eec21e5f..a0397504a17c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2699,11 +2699,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
IsMasked, Alignment, Ctx.CostKind,
&Ingredient);
- else
- return Ctx.TTI.getAddressComputationCost(Ty) +
- Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
+
+ return Ctx.TTI.getAddressComputationCost(Ty) +
+ Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
}
InstructionCost Cost = 0;
>From 6dfbf2b33af7d61b8d14ace3b806761bb7a00d08 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 7 Mar 2025 00:03:42 -0800
Subject: [PATCH 08/15] Fix: Correct the vector pointer when UF>1
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +-
.../RISCV/riscv-vector-reverse-output.ll | 96 +++++++++--------
.../RISCV/riscv-vector-reverse.ll | 102 +++++++++---------
3 files changed, 106 insertions(+), 101 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a0397504a17c5..c8f153ddb13b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2257,11 +2257,14 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
// TODO: Support non-unit-reverse strided accesses.
- int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
- Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Step);
+ Value *Index =
+ Strided
+ ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
+ : Increment;
Value *ResultPtr =
- Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
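[Illustration, not part of the patch] The VPVectorPointerRecipe::execute change
above computes the per-part start pointer for a strided (stride -1) access by
negating the usual VF * Part step, so part 0 begins at the base pointer and
part K begins K * VF elements below it. A minimal sketch of that address
arithmetic in plain C++, with stand-in parameters VL (the runtime VF),
ElemSize (element size in bytes) and Part:

  #include <cstdint>

  // Start address of unroll part `Part` of a widened memory access. For the
  // strided (stride -1) lowering the per-part step is negated, matching the
  // `mul i64 ..., -1` + getelementptr sequence in the test updates below.
  static const char *partStartAddr(const char *Base, uint64_t VL,
                                   uint64_t ElemSize, unsigned Part,
                                   bool Strided) {
    int64_t Index = static_cast<int64_t>(VL * Part); // createStepForVF
    if (Strided)
      Index = -Index; // step downwards for the reverse access
    return Base + Index * static_cast<int64_t>(ElemSize);
  }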
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 64f7626183817..8c58df32c564a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -147,30 +147,32 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; RV64-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4294967292
-; RV64-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
-; RV64-UF2-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
-; RV64-UF2-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD1]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
-; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; RV64-UF2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4294967292
-; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP17]], ptr align 4 [[TMP20]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP25]])
+; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], -1
+; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP12]]
+; RV64-UF2-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; RV64-UF2-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; RV64-UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD1]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
+; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], -1
+; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP24]]
; RV64-UF2-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP23]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
+; RV64-UF2-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP19]], ptr align 4 [[TMP25]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP29]])
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -181,8 +183,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP29]], 1
+; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1
; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -338,30 +340,32 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; RV64-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4294967292
-; RV64-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
-; RV64-UF2-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP12]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
-; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
-; RV64-UF2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4294967292
-; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP17]], ptr align 4 [[TMP20]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP25]])
+; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], -1
+; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP12]]
+; RV64-UF2-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; RV64-UF2-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
+; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], -1
+; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP24]]
; RV64-UF2-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP23]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
+; RV64-UF2-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4
+; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP19]], ptr align 4 [[TMP25]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP29]])
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -372,8 +376,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP29]], 1.000000e+00
+; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 5e4bd284e1fa8..eb2576c2acad2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -38,10 +38,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -74,15 +74,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -113,10 +113,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -141,7 +141,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 32
+; CHECK-NEXT: LV: Loop cost is 22
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -150,8 +150,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: ir<%0> = original trip-count
; CHECK-EMPTY:
@@ -196,16 +195,16 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -286,10 +285,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -322,15 +321,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -361,10 +360,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -389,7 +388,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 34
+; CHECK-NEXT: LV: Loop cost is 24
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -398,8 +397,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: ir<%0> = original trip-count
; CHECK-EMPTY:
@@ -444,16 +442,16 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
>From 293bf7ce3fc7abb102e8f14f8308659dc5ec9af0 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 10 Mar 2025 01:27:34 -0700
Subject: [PATCH 09/15] NFC: Add const, and check the stride via Legal->isConsecutivePtr.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cfc37eff3c396..1432575a1c922 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1285,10 +1285,14 @@ class LoopVectorizationCostModel {
/// Returns true if the target machine can represent \p V as a strided load
/// or store operation.
- bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+ bool isLegalStridedLoadStore(Value *V, ElementCount VF) const {
if (!isa<LoadInst, StoreInst>(V))
return false;
auto *Ty = getLoadStoreType(V);
+ Value *Ptr = getLoadStorePointerOperand(V);
+ // TODO: Support non-unit-reverse strided accesses.
+ if (Legal->isConsecutivePtr(Ty, Ptr) != -1)
+ return false;
Align Align = getLoadStoreAlignment(V);
if (VF.isVector())
Ty = VectorType::get(Ty, VF);
@@ -1629,7 +1633,8 @@ class LoopVectorizationCostModel {
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
/// The cost computation for strided load/store instruction.
- InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+ InstructionCost getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) const;
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
@@ -5826,7 +5831,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
- ElementCount VF) {
+ ElementCount VF) const {
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
>From a0a580d44ab19a1e94459767b1b464014eb714d8 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 12 Mar 2025 01:50:39 -0700
Subject: [PATCH 10/15] NFC: Replace isLegalStridedLoadStore with
stridedAccessCanBeWidened
---
.../Transforms/Vectorize/LoopVectorize.cpp | 45 +++++++++++--------
1 file changed, 26 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1432575a1c922..4cd7a6302f1e4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1283,22 +1283,6 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
- /// Returns true if the target machine can represent \p V as a strided load
- /// or store operation.
- bool isLegalStridedLoadStore(Value *V, ElementCount VF) const {
- if (!isa<LoadInst, StoreInst>(V))
- return false;
- auto *Ty = getLoadStoreType(V);
- Value *Ptr = getLoadStorePointerOperand(V);
- // TODO: Support non-unit-reverse strided accesses.
- if (Legal->isConsecutivePtr(Ty, Ptr) != -1)
- return false;
- Align Align = getLoadStoreAlignment(V);
- if (VF.isVector())
- Ty = VectorType::get(Ty, VF);
- return TTI.isLegalStridedLoadStore(Ty, Align);
- }
-
/// Returns true if the target machine supports all of the reduction
/// variables found for the given VF.
bool canVectorizeReductions(ElementCount VF) const {
@@ -1363,6 +1347,10 @@ class LoopVectorizationCostModel {
return InterleaveInfo.getInterleaveGroup(Instr);
}
+ /// Returns true if \p I is a memory instruction with strided memory access
+ /// that can be vectorized.
+ bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -3478,6 +3466,26 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
return true;
}
+bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
+ Instruction *I, ElementCount VF) const {
+ // Get and ensure we have a valid memory instruction.
+ assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+
+ // Only support strided access for vector VF.
+ if (!VF.isVector())
+ return false;
+
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *ScalarTy = getLoadStoreType(I);
+ // Ensure the accessed addresses are evenly spaced apart by a fixed stride.
+ // TODO: Support non-unit-reverse strided accesses.
+ if (Legal->isConsecutivePtr(ScalarTy, Ptr) != -1)
+ return false;
+
+ const Align Alignment = getLoadStoreAlignment(I);
+ return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
+}
+
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
@@ -6163,10 +6171,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
// Consider using strided load/store for consecutive reverse accesses to
// achieve more efficient memory operations.
- if (ConsecutiveStride == -1) {
+ if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
const InstructionCost StridedLoadStoreCost =
- isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
- : InstructionCost::getInvalid();
+ getStridedLoadStoreCost(&I, VF);
if (StridedLoadStoreCost < Cost) {
Decision = CM_Strided;
Cost = StridedLoadStoreCost;
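[Illustration, not part of the patch] A minimal sketch of the decision the hunk
above makes for a consecutive reverse access, with plain values standing in
for the cost-model queries: the strided lowering is picked only when
stridedAccessCanBeWidened holds and its cost is strictly lower than the
default widen-reverse cost.

  // Stand-in for the CM_Widen_Reverse vs. CM_Strided choice made in
  // setCostBasedWideningDecision.
  enum class WideningDecision { WidenReverse, Strided };

  static WideningDecision pickReverseDecision(bool StridedCanBeWidened,
                                              unsigned ReverseCost,
                                              unsigned StridedCost) {
    if (StridedCanBeWidened && StridedCost < ReverseCost)
      return WideningDecision::Strided;
    return WideningDecision::WidenReverse;
  }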
>From e5738bfddb95e38bd284f886ea0d3b59684c76e5 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 12 Mar 2025 06:27:41 -0700
Subject: [PATCH 11/15] Fix onlyFirstLaneUsed
---
llvm/lib/Transforms/Vectorize/VPlan.h | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0d3ff54ab04b0..f8f1a97e16b4c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2672,9 +2672,9 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- // Widened, consecutive loads operations only demand the first lane of
- // their address.
- return Op == getAddr() && isConsecutive();
+ // Widened, consecutive/strided loads operations only demand the first
+ // lane of their address.
+ return Op == getAddr() && (isConsecutive() || isStrided());
}
};
@@ -2712,9 +2712,10 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- // Widened loads only demand the first lane of EVL and consecutive loads
- // only demand the first lane of their address.
- return Op == getEVL() || (Op == getAddr() && isConsecutive());
+ // Widened loads only demand the first lane of EVL and consecutive/strided
+ // loads only demand the first lane of their address.
+ return Op == getEVL() ||
+ (Op == getAddr() && (isConsecutive() || isStrided()));
}
};
@@ -2753,9 +2754,10 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- // Widened, consecutive stores only demand the first lane of their address,
- // unless the same operand is also stored.
- return Op == getAddr() && isConsecutive() && Op != getStoredValue();
+ // Widened, consecutive/strided stores only demand the first lane of their
+ // address, unless the same operand is also stored.
+ return Op == getAddr() && (isConsecutive() || isStrided()) &&
+ Op != getStoredValue();
}
};
@@ -2800,10 +2802,11 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
assert(getStoredValue() != Op && "unexpected store of EVL");
return true;
}
- // Widened, consecutive memory operations only demand the first lane of
- // their address, unless the same operand is also stored. That latter can
+ // Widened, consecutive/strided memory operations only demand the first lane
+ // of their address, unless the same operand is also stored. That latter can
// happen with opaque pointers.
- return Op == getAddr() && isConsecutive() && Op != getStoredValue();
+ return Op == getAddr() && (isConsecutive() || isStrided()) &&
+ Op != getStoredValue();
}
};
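[Illustration, not part of the patch] A minimal sketch of the first-lane rule
the hunks above extend, with plain booleans standing in for the recipe
queries: a consecutive or strided access only needs lane 0 of its address
(the base pointer is scalar; the consecutive layout or the stride covers the
remaining lanes), and for stores the address must not also be the stored
value.

  // Stand-in for onlyFirstLaneUsed on the address operand of a widened
  // memory recipe.
  static bool addrOnlyNeedsFirstLane(bool IsConsecutive, bool IsStrided,
                                     bool AddrIsAlsoStoredValue) {
    return (IsConsecutive || IsStrided) && !AddrIsAlsoStoredValue;
  }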
>From 4c7d4ccb3f751128a427f63abcbaabbdc0d5b2f4 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 12 Mar 2025 06:30:21 -0700
Subject: [PATCH 12/15] NFC: Update comment for VPVectorPointerRecipe
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f8f1a97e16b4c..bc56dfa2ba3aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1595,6 +1595,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
};
/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// Supports both consecutive and reverse consecutive accesses.
+/// TODO: Support non-unit strided accesses.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;
>From 6d78ece732e1e7a00120d50f29293308c581d268 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 17 Mar 2025 05:40:05 -0700
Subject: [PATCH 13/15] NFC: Assert stride == -1
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4cd7a6302f1e4..2081207c876c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3475,12 +3475,13 @@ bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
if (!VF.isVector())
return false;
- auto *Ptr = getLoadStorePointerOperand(I);
+ [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);
- // Ensure the accessed addresses are evenly spaced apart by a fixed stride.
- // TODO: Support non-unit-reverse strided accesses.
- if (Legal->isConsecutivePtr(ScalarTy, Ptr) != -1)
- return false;
+ // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
+ // to ensure that the accessed addresses are evenly spaced apart by a fixed
+ // stride.
+ assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
+ "Only supports strided accesses with a stride of -1");
const Align Alignment = getLoadStoreAlignment(I);
return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
>From 64cc31df6706c2d438ca04d7801433830e8b3204 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Mar 2025 07:43:53 -0700
Subject: [PATCH 14/15] New Recipe VPWidenStridedLoadRecipe
- Remove strided store support
- Still needs CM.getStride
---
.../Transforms/Vectorize/LoopVectorize.cpp | 22 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 102 +++++---
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 164 ++++++------
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 3 +-
.../RISCV/riscv-vector-reverse-output.ll | 234 +++++++++---------
.../RISCV/riscv-vector-reverse.ll | 38 +--
...-force-tail-with-evl-reverse-load-store.ll | 74 +++---
...orize-force-tail-with-evl-uniform-store.ll | 27 +-
.../Transforms/Vectorize/VPlanTest.cpp | 7 +-
12 files changed, 385 insertions(+), 310 deletions(-)
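
For context on what this patch generates: a reverse (stride -1) i32 access is
lowered to llvm.experimental.vp.strided.load with a byte stride of
-1 * sizeof(element) = -4 and an element count taken from the runtime VF
(truncated to i32). A minimal IR sketch, mirroring the updated RISC-V checks
below (value names are illustrative):

  %evl = trunc i64 %runtime.vf to i32
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %base, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)
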
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2081207c876c3..89a21e52b9fba 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3475,6 +3475,10 @@ bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
if (!VF.isVector())
return false;
+ // FIXME: Remove this check for StoreInst after strided store is supported.
+ if (isa<StoreInst>(I))
+ return false;
+
[[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);
// TODO: Support non-unit-reverse strided accesses. Add stride analysis here
@@ -4424,7 +4428,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
- .Case<VPWidenLoadRecipe>(
+ .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4523,6 +4527,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -8418,13 +8423,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
- if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+ if (Strided) {
+ const DataLayout &DL = Load->getDataLayout();
+ auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
+ VPValue *Stride = Plan.getOrAddLiveIn(ConstantInt::get(
+ StrideTy, -1 * DL.getTypeAllocSize(getLoadStoreType(Load))));
+ return new VPWidenStridedLoadRecipe(*Load, Ptr, Stride, &Plan.getVF(),
+ Mask, I->getDebugLoc());
+ }
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- Strided, I->getDebugLoc());
+ I->getDebugLoc());
+ }
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, Strided, I->getDebugLoc());
+ Reverse, I->getDebugLoc());
}
 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bc56dfa2ba3aa..a7247bd21885c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -537,6 +537,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -2567,9 +2568,6 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
- /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
- bool Strided = false;
-
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -2583,9 +2581,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
+ bool Consecutive, bool Reverse, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
- Reverse(Reverse), Strided(Strided) {
+ Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
@@ -2598,7 +2596,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
}
static inline bool classof(const VPUser *U) {
@@ -2613,10 +2612,6 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// order.
bool isReverse() const { return Reverse; }
- /// Return whether the accessed addresses are evenly spaced apart by a fixed
- /// stride.
- bool isStrided() const { return Strided; }
-
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -2646,16 +2641,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// optional mask.
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
+ bool Consecutive, bool Reverse, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Strided, DL),
+ Reverse, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, Strided,
+ getMask(), Consecutive, Reverse,
getDebugLoc());
}
@@ -2674,9 +2669,9 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- // Widened, consecutive/strided loads operations only demand the first
- // lane of their address.
- return Op == getAddr() && (isConsecutive() || isStrided());
+ // Widened, consecutive load operations only demand the first lane of their
+ // address.
+ return Op == getAddr() && isConsecutive();
}
};
@@ -2687,7 +2682,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L.isStrided(), L.getDebugLoc()),
+ L.isReverse(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2716,8 +2711,55 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
"Op must be an operand of the recipe");
// Widened loads only demand the first lane of EVL and consecutive/strided
// loads only demand the first lane of their address.
- return Op == getEVL() ||
- (Op == getAddr() && (isConsecutive() || isStrided()));
+ return Op == getEVL() || (Op == getAddr() && isConsecutive());
+ }
+};
+
+/// A recipe for strided load operations, using the base address, stride,
+/// runtime VF, and an optional mask.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+ public VPValue {
+ VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+ VPValue *VF, VPValue *Mask, DebugLoc DL)
+ : VPWidenMemoryRecipe(VPDef::VPWidenStridedLoadSC, Load,
+ {Addr, Stride, VF},
+ /*Consecutive=*/false, /*Reverse=*/false, DL),
+ VPValue(this, &Load) {
+ setMask(Mask);
+ }
+
+ VPWidenStridedLoadRecipe *clone() override {
+ return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+ getStride(), getVF(), getMask(),
+ getDebugLoc());
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+ /// Return the stride operand.
+ VPValue *getStride() const { return getOperand(1); }
+
+ /// Return the VF operand.
+ VPValue *getVF() const { return getOperand(2); }
+
+ /// Generate a strided load.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenStridedLoadRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() || Op == getStride() || Op == getVF();
}
};
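
For readers skimming the header changes: the new recipe carries the address,
stride, and runtime VF as operands (plus an optional mask), all of which only
demand their first lane. In VPlan dumps it prints roughly as in the updated
riscv-vector-reverse.ll checks further down, e.g. (operand names illustrative):

  WIDEN ir<%ld> = load vp<%ptr>, stride = ir<-4>, runtimeVF = vp<%vf>
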
@@ -2725,17 +2767,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive, bool Reverse,
- bool Strided, DebugLoc DL)
+ VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Strided, DL) {
+ Consecutive, Reverse, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, Strided, getDebugLoc());
+ Reverse, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2756,10 +2797,9 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- // Widened, consecutive/strided stores only demand the first lane of their
+ // Widened, consecutive stores only demand the first lane of their
// address, unless the same operand is also stored.
- return Op == getAddr() && (isConsecutive() || isStrided()) &&
- Op != getStoredValue();
+ return Op == getAddr() && isConsecutive() && Op != getStoredValue();
}
};
@@ -2770,8 +2810,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S.isStrided(),
- S.getDebugLoc()) {
+ S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
setMask(Mask);
}
@@ -2804,11 +2843,10 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
assert(getStoredValue() != Op && "unexpected store of EVL");
return true;
}
- // Widened, consecutive/strided memory operations only demand the first lane
- // of their address, unless the same operand is also stored. That latter can
+ // Widened, consecutive memory operations only demand the first lane of
+ // their address, unless the same operand is also stored. That latter can
// happen with opaque pointers.
- return Op == getAddr() && (isConsecutive() || isStrided()) &&
- Op != getStoredValue();
+ return Op == getAddr() && isConsecutive() && Op != getStoredValue();
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 24a166bd336d1..94031b0eabd4b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -159,8 +159,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
- "Store recipes should not define any values");
+ assert(
+ (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c8f153ddb13b1..e7c5dcd719599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -76,6 +76,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -97,6 +98,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
switch (getVPDefID()) {
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -175,6 +177,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
}
case VPInterleaveSC:
return mayWriteToMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -2698,11 +2701,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
- if (Strided)
- return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
-
return Ctx.TTI.getAddressComputationCost(Ty) +
Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
IsMasked, Alignment, Ctx.CostKind,
@@ -2733,13 +2731,11 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive() && !isStrided();
+ bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = isStrided()
- ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
- : nullptr;
+ Value *Mask = nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2754,25 +2750,9 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- if (isStrided()) {
- const DataLayout &DL = LI->getDataLayout();
- auto *PtrTy = Addr->getType();
- auto *StrideTy = DL.getIndexType(PtrTy);
- // TODO: Support non-unit-reverse strided accesses.
- auto *StrideVal =
- ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- NewLI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
- {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
- cast<CallInst>(NewLI)->addParamAttr(
- 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
- } else {
- NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy),
- "wide.masked.load");
- }
+ NewLI =
+ Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy), "wide.masked.load");
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -2810,7 +2790,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive() && !isStrided();
+ bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
@@ -2830,16 +2810,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
- } else if (isStrided()) {
- const DataLayout &DL = LI->getDataLayout();
- auto *PtrTy = Addr->getType();
- auto *StrideTy = DL.getIndexType(PtrTy);
- // TODO: Support non-unit-reverse strided accesses.
- auto *StrideVal =
- ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
- NewLI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
- {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2890,19 +2860,73 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
+ auto *LI = cast<LoadInst>(&Ingredient);
+
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+ Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
+ Value *Stride = State.get(getStride(), /*IsScalar*/ true);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask())
+ Mask = State.get(VPMask);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
+ Builder.getInt32Ty());
+
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = Stride->getType();
+ CallInst *NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ State.addMetadata(NewLI, LI);
+ State.set(this, NewLI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = load ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", stride = ";
+ getStride()->printAsOperand(O, SlotTracker);
+ O << ", runtimeVF = ";
+ getVF()->printAsOperand(O, SlotTracker);
+}
+#endif
+
+InstructionCost
+VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
+}
+
void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredVPValue = getStoredValue();
- bool CreateScatter = !isConsecutive() && !isStrided();
+ bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = isStrided()
- ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
- : nullptr;
+ Value *Mask = nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2921,32 +2945,12 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
- if (CreateScatter) {
+ if (CreateScatter)
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
- } else if (Mask) {
- if (isStrided()) {
- const DataLayout &DL = SI->getDataLayout();
- auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
- Type *StoredEltTy = StoredVecTy->getElementType();
- auto *PtrTy = Addr->getType();
- auto *StrideTy = DL.getIndexType(PtrTy);
- // TODO: Support non-unit-reverse strided accesses.
- auto *StrideVal =
- ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- NewSI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_store,
- {StoredVecTy, PtrTy, StrideTy},
- {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
- cast<CallInst>(NewSI)->addParamAttr(
- 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
- } else {
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
- }
- } else {
+ else if (Mask)
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ else
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
- }
State.addMetadata(NewSI, SI);
}
@@ -2962,7 +2966,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredValue = getStoredValue();
- bool CreateScatter = !isConsecutive() && !isStrided();
+ bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -2987,25 +2991,11 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- if (isStrided()) {
- const DataLayout &DL = SI->getDataLayout();
- auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
- Type *StoredEltTy = StoredVecTy->getElementType();
- auto *PtrTy = Addr->getType();
- auto *StrideTy = DL.getIndexType(PtrTy);
- // TODO: Support non-unit-reverse strided accesses.
- auto *StrideVal =
- ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
- NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
- {StoredVecTy, PtrTy, StrideTy},
- {StoredVal, Addr, StrideVal, Mask, EVL});
- } else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
- }
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d3cfaa0abf0cf..e57918fdc1b54 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -76,13 +76,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
+ false /*Consecutive*/, false /*Reverse*/,
Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- false /*Strided*/, Ingredient.getDebugLoc());
+ Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -1907,6 +1907,12 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
VPValue *NewMask = GetNewMask(L->getMask());
return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
})
+ .Case<VPWidenStridedLoadRecipe>([&](VPWidenStridedLoadRecipe *L) {
+ VPValue *NewMask = GetNewMask(L->getMask());
+ return new VPWidenStridedLoadRecipe(
+ *cast<LoadInst>(&L->getIngredient()), L->getAddr(), L->getStride(),
+ &EVL, NewMask, L->getDebugLoc());
+ })
.Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
VPValue *NewMask = GetNewMask(S->getMask());
return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
@@ -2006,7 +2012,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
NumDefVal <= 1 &&
"Only supports recipes with a single definition or without users.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ EVLRecipe)) {
VPValue *CurVPV = CurRecipe->getVPSingleValue();
CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
}
@@ -2600,7 +2607,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *L = new VPWidenLoadRecipe(
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, /*Strided=*/false, LoadGroup->getDebugLoc());
+ /*Reverse=*/false, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
return L;
}
@@ -2631,7 +2638,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *S = new VPWidenStoreRecipe(
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, /*Strided=*/false, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
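
A note on the EVL path handled just above: when tail folding with EVL,
createEVLRecipe rebuilds the strided load with &EVL as its VF operand (the
verifier change below checks that the EVL sits at operand index 2), so the
lowered intrinsic consumes the explicit vector length directly. A hedged
sketch, assuming the usual get.vector.length pattern (names illustrative):

  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %remaining, i32 4, i1 true)
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %base, i64 -4, <vscale x 4 x i1> %mask, i32 %evl)
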
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 2b762d0533d19..24b9ebfd6ae70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -334,6 +334,7 @@ class VPDef {
VPReplicateSC,
VPScalarCastSC,
VPScalarIVStepsSC,
+ VPWidenStridedLoadSC,
VPVectorPointerSC,
VPVectorEndPointerSC,
VPWidenCallSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index f7fa659ba6a8a..0d3bbf365c1ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -143,7 +143,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
.Case<VPWidenIntrinsicRecipe>([&](const VPWidenIntrinsicRecipe *S) {
return VerifyEVLUse(*S, S->getNumOperands() - 1);
})
- .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe>(
+ .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
+ VPWidenStridedLoadRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
.Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 8c58df32c564a..c5e3a04e3dfe8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -40,15 +40,16 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; RV64-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
-; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
-; RV64-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
-; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; RV64-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
-; RV64-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP13]], ptr align 4 [[TMP15]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; RV64-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV64-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]]
+; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE]], ptr [[TMP17]], align 4
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -93,18 +94,20 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; RV32-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; RV32-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
-; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP10]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
-; RV32-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; RV32-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; RV32-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
-; RV32-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[TMP13]], ptr align 4 [[TMP15]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; RV32-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP10]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; RV32-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV32-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP15:%.*]] = mul i32 0, [[TMP14]]
+; RV32-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP14]]
+; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP15]]
+; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 [[TMP16]]
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE]], ptr [[TMP18]], align 4
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -115,8 +118,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1
+; RV32-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], 1
; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -137,54 +140,54 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; RV64-UF2: [[VECTOR_BODY]]:
; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; RV64-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], -1
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
-; RV64-UF2-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
-; RV64-UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD1]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
-; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], -1
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP24]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
-; RV64-UF2-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP19]], ptr align 4 [[TMP25]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP29]])
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1
+; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP13]]
+; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; RV64-UF2-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_STRIDED_LOAD1]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP21]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP24]]
+; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP25]]
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE]], ptr [[TMP23]], align 4
+; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP27]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
; RV64-UF2: [[FOR_BODY]]:
; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1
+; RV64-UF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP29]], 1
; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -233,15 +236,16 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; RV64-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
-; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
-; RV64-NEXT: [[TMP13:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
-; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; RV64-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
-; RV64-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP13]], ptr align 4 [[TMP15]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; RV64-NEXT: [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV64-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP14]]
+; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP15]]
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP12]])
+; RV64-NEXT: store <vscale x 4 x float> [[REVERSE]], ptr [[TMP17]], align 4
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -286,18 +290,20 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; RV32-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; RV32-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 4
-; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP10]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
-; RV32-NEXT: [[TMP13:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; RV32-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; RV32-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
-; RV32-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> [[TMP13]], ptr align 4 [[TMP15]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; RV32-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP10]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; RV32-NEXT: [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV32-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP15:%.*]] = mul i32 0, [[TMP14]]
+; RV32-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP14]]
+; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP15]]
+; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP16]]
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP12]])
+; RV32-NEXT: store <vscale x 4 x float> [[REVERSE]], ptr [[TMP18]], align 4
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -308,8 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00
+; RV32-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP20]], 1.000000e+00
; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
@@ -330,54 +336,54 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; RV64-UF2: [[VECTOR_BODY]]:
; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; RV64-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], -1
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP9]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
-; RV64-UF2-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
-; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
-; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; RV64-UF2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], -1
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP24]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP27]])
-; RV64-UF2-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
-; RV64-UF2-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4
-; RV64-UF2-NEXT: call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP19]], ptr align 4 [[TMP25]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP29]])
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1
+; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP13]]
+; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32
+; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP20]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP21]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP24]]
+; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP25]]
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP17]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE]], ptr [[TMP23]], align 4
+; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP18]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP27]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
; RV64-UF2: [[FOR_BODY]]:
; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
+; RV64-UF2-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP29]], 1.000000e+00
; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb2576c2acad2..5bf966a538682 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -41,7 +41,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -78,10 +78,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -116,7 +116,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -141,7 +141,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 22
+; CHECK-NEXT: LV: Loop cost is 27
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -150,7 +150,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: ir<%0> = original trip-count
; CHECK-EMPTY:
@@ -199,12 +200,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -288,7 +289,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -325,10 +326,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -363,7 +364,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -388,7 +389,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 24
+; CHECK-NEXT: LV: Loop cost is 29
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
@@ -397,7 +398,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: ir<%0> = original trip-count
; CHECK-EMPTY:
@@ -446,12 +448,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
-; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
-; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 11beb84802296..83319574e33a7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -22,36 +22,41 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[N_VEC]] to i32
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP14]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = sub i64 1, [[TMP12]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP11]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP14]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
@@ -100,7 +105,6 @@ loopend:
}
define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noalias %ptr1, ptr noalias %ptr2) {
-;
; IF-EVL-LABEL: @reverse_load_store_masked(
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -123,22 +127,28 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
-; IF-EVL-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> [[TMP11]], i32 [[TMP7]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP15]], i64 -4, <vscale x 4 x i1> [[TMP11]], i32 [[TMP7]])
-; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 -4, <vscale x 4 x i1> [[TMP14]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP20]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP21]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP23]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP24]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 41a60115433a2..c7c8ee4326a8b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -36,25 +36,30 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
-; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP14]], i64 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
+; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = sub nuw nsw i64 1, [[IV]]
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]]
-; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX13]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = sub nuw nsw i64 1, [[IV1]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP22]]
+; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index eb4fb9bf483e9..c1897e2c5d277 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1084,7 +1084,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1201,7 +1201,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1215,8 +1215,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false,
- {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
>From 8fe1f5eeb01ec4530736a4b6a0bc0d580873c53d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 2 Apr 2025 01:34:31 -0700
Subject: [PATCH 15/15] Introduce StrideInfo in CostModel
---
.../Transforms/Vectorize/LoopVectorize.cpp | 23 ++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 89a21e52b9fba..8d7a61cc8b50b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1351,6 +1351,16 @@ class LoopVectorizationCostModel {
/// that can be vectorized.
bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+ /// Get the stride of the strided memory access instruction \p Instr. Return 0
+ /// if the instruction \p Instr is not considered for vectorization as a
+ /// strided memory access.
+ int64_t getStride(Instruction *Instr) const {
+ auto It = StrideInfo.find(Instr);
+ if (It != StrideInfo.end())
+ return It->second;
+ return 0;
+ }
+
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1763,6 +1773,9 @@ class LoopVectorizationCostModel {
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
+ /// The mapping of memory access instructions to their stride values.
+ DenseMap<Instruction *, int64_t> StrideInfo;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -6183,6 +6196,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (StridedLoadStoreCost < Cost) {
Decision = CM_Strided;
Cost = StridedLoadStoreCost;
+ StrideInfo[&I] = ConsecutiveStride;
}
}
setWideningDecision(&I, VF, Decision, Cost);
@@ -8427,9 +8441,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
if (Strided) {
const DataLayout &DL = Load->getDataLayout();
auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
- VPValue *Stride = Plan.getOrAddLiveIn(ConstantInt::get(
- StrideTy, -1 * DL.getTypeAllocSize(getLoadStoreType(Load))));
- return new VPWidenStridedLoadRecipe(*Load, Ptr, Stride, &Plan.getVF(),
+ int64_t Stride = CM.getStride(Load);
+ assert(Stride == -1 &&
+ "Only stride memory access with a stride of -1 is supported.");
+ VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+ StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
+ return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
Mask, I->getDebugLoc());
}
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
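For readers skimming the CHECK lines above, here is a minimal, hand-written IR sketch (not part of the patch series; function and argument names are illustrative only) of what the strided-load path produces for an i32 reverse access: tryToWidenMemory scales the element stride of -1 by the type's alloc size, so the VP strided-load intrinsic receives a byte stride of -4, matching the IF-EVL tests earlier in this mail.

declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr, i64, <vscale x 4 x i1>, i32)

define <vscale x 4 x i32> @reverse_i32_load_sketch(ptr %p, <vscale x 4 x i1> %mask, i32 %evl) {
  ; byte stride = element stride (-1) * alloc size of i32 (4 bytes) = -4
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %p, i64 -4, <vscale x 4 x i1> %mask, i32 %evl)
  ret <vscale x 4 x i32> %v
}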