[llvm] [VPlan] Extract reverse operation for reverse accesses (PR #146525)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 5 02:10:09 PST 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/146525
From 2e6dfcde57ab6563e1e419b9ece7db7a26a3acbf Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 1 Jul 2025 06:11:39 -0700
Subject: [PATCH 01/13] [VPlan] Extract reverse operation for reverse accesses
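
Reverse memory accesses currently hide the element reversal inside the
widened memory recipes: VPWidenLoadRecipe reverses its result,
VPWidenStoreRecipe reverses the stored value, and both (plus their EVL
variants) fold an SK_Reverse shuffle into their cost. This patch
extracts the reversal into an explicit VPInstruction::Reverse so the
memory recipes model only the access itself:

* tryToWidenMemory now returns a VPRecipeBase and, for reverse
  accesses, emits a VPInstruction::Reverse on the loaded value or on
  the value to be stored.
* VPInstruction::Reverse infers its type from its single operand,
  costs as an SK_Reverse shuffle, and lowers through
  Builder.CreateVectorReverse.
* When tail folding by EVL, convertToEVLReverse replaces each
  VPInstruction::Reverse with a widened call to
  llvm.experimental.vp.reverse using an all-true mask and the EVL.

As a rough sketch (mirroring the updated tests; %addr and %evl are
placeholder names), a reverse load now comes out as a plain wide load
followed by a separate reverse:

  %wide.load = load <vscale x 4 x i32>, ptr %addr, align 4
  %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)

and under EVL tail folding the reverse instead becomes:

  %vp.reverse = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)

Most of the test churn below is pure scheduling: the reverse
operations now cluster after the loads and before the stores instead
of being interleaved with them.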
---
.../Transforms/Vectorize/LoopVectorize.cpp | 23 +++--
.../Transforms/Vectorize/VPRecipeBuilder.h | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +
.../Transforms/Vectorize/VPlanAnalysis.cpp | 1 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 51 ++++-------
.../Transforms/Vectorize/VPlanTransforms.cpp | 27 ++++++
.../AArch64/sve-vector-reverse-mask4.ll | 2 +-
.../AArch64/vector-reverse-mask4.ll | 2 +-
.../PowerPC/optimal-epilog-vectorization.ll | 28 +++---
.../RISCV/riscv-vector-reverse.ll | 52 +++++------
.../RISCV/tail-folding-reverse-load-store.ll | 11 ++-
.../RISCV/tail-folding-uniform-store.ll | 28 +++---
.../RISCV/vplan-riscv-vector-reverse.ll | 8 +-
.../LoopVectorize/X86/masked_load_store.ll | 90 +++++++++----------
.../interleave-with-i65-induction.ll | 2 +-
.../LoopVectorize/iv-select-cmp-decreasing.ll | 12 +--
.../reuse-lcssa-phi-scev-expansion.ll | 2 +-
.../LoopVectorize/reverse_induction.ll | 10 +--
.../single-early-exit-interleave.ll | 24 ++---
.../vplan-sink-scalars-and-merge.ll | 3 +-
20 files changed, 204 insertions(+), 176 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f78acf31a0de2..f1a2b338327e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7586,8 +7586,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
- VFRange &Range) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
+ VFRange &Range) {
assert((VPI->getOpcode() == Instruction::Load ||
VPI->getOpcode() == Instruction::Store) &&
"Must be called with either a load or store");
@@ -7648,15 +7648,26 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
+
if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
- return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI,
- VPI->getDebugLoc());
+ auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ *VPI, Load->getDebugLoc());
+ if (Reverse) {
+ Builder.insert(LoadR);
+ return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
+ LoadR->getDebugLoc());
+ }
+ return LoadR;
}
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
- Consecutive, Reverse, *VPI, VPI->getDebugLoc());
+ VPValue *StoredVal = VPI->getOperand(0);
+ if (Reverse)
+ StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
+ Store->getDebugLoc());
+ return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
+ Reverse, *VPI, Store->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 1808be118cd2a..59733143c31a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,7 +92,7 @@ class VPRecipeBuilder {
 /// Check if the load or store instruction \p VPI should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
- VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
+ VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
/// Check if an induction recipe should be constructed for \p VPI. If so build
/// and return it. If not, return null.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fe60e97d44997..3b8b8e2f2aa00 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1111,6 +1111,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// Unrolling will add all copies of its original operand as additional
// operands.
LastActiveLane,
+ // Returns a reversed vector for the operand.
+ Reverse,
// The opcodes below are used for VPInstructionWithType.
//
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index ea38a8b16ebc7..5c5d44454d060 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -135,6 +135,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
+ case VPInstruction::Reverse:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1b1308c78c76e..b9ba3474a6b8c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -444,6 +444,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::Not:
case VPInstruction::ResumeForEpilogue:
+ case VPInstruction::Reverse:
case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
@@ -917,6 +918,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::Reverse:
+ return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1103,6 +1106,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::Reverse: {
+ assert(VF.isVector() && "Reverse operation requires a vector type");
+ auto *VectorTy = cast<VectorType>(
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
+ return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+ VectorTy, /*Mask=*/{}, Ctx.CostKind,
+ /*Index=*/0);
+ }
case VPInstruction::ExtractLastElement: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
@@ -1206,6 +1217,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::WidePtrAdd:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::Reverse:
case VPInstruction::VScale:
case VPInstruction::Unpack:
return false;
@@ -1383,6 +1395,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::Reverse:
+ O << "reverse";
+ break;
case VPInstruction::Unpack:
O << "unpack";
break;
@@ -3518,12 +3533,7 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
OpInfo, &Ingredient);
}
- if (!Reverse)
- return Cost;
-
- return Cost += Ctx.TTI.getShuffleCost(
- TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
- cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
+ return Cost;
}
void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -3554,8 +3564,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
applyMetadata(*cast<Instruction>(NewLI));
- if (Reverse)
- NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
State.set(this, NewLI);
}
@@ -3610,8 +3618,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
- if (isReverse())
- Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}
@@ -3628,15 +3634,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
- InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
+ return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
Ctx.CostKind);
- if (!Reverse)
- return Cost;
-
- return Cost + Ctx.TTI.getShuffleCost(
- TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
- cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3665,13 +3665,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *StoredVal = State.get(StoredVPValue);
- if (isReverse()) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
if (CreateScatter)
@@ -3700,8 +3693,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
- if (isReverse())
- StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
@@ -3738,15 +3729,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
- InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
+ return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
Ctx.CostKind);
- if (!Reverse)
- return Cost;
-
- return Cost + Ctx.TTI.getShuffleCost(
- TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
- cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a59c8cf9ea1ef..75a2ee1a0869c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2849,6 +2849,32 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return nullptr;
}
+static void convertToEVLReverse(VPlan &Plan, VPTypeAnalysis &TypeInfo,
+ VPValue &EVL) {
+ SmallVector<VPRecipeBase *> ToRemove;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (!VPI || VPI->getOpcode() != VPInstruction::Reverse)
+ continue;
+
+ SmallVector<VPValue *> Ops(VPI->operands());
+ Ops.append({Plan.getTrue(), &EVL});
+ auto *NewReverse = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, Ops,
+ TypeInfo.inferScalarType(VPI), {}, {}, VPI->getDebugLoc());
+ NewReverse->insertBefore(VPI);
+ VPI->replaceAllUsesWith(NewReverse);
+ ToRemove.push_back(VPI);
+ }
+ }
+
+ for (VPRecipeBase *R : ToRemove)
+ R->eraseFromParent();
+}
+
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(Plan);
@@ -2964,6 +2990,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
ToErase.push_back(CurRecipe);
}
+ convertToEVLReverse(Plan, TypeInfo, EVL);
// Remove dead EVL mask.
if (EVLMask->getNumUsers() == 0)
ToErase.push_back(EVLMask->getDefiningRecipe());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index c3f7a251e37fa..7f3b28422e47b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -22,8 +22,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
-; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE9]]
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 17be6cc362a2c..f0aa75aab1f91 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -37,8 +37,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index f1fbf1dd5d942..850947addf9c4 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -278,20 +278,20 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 -28
; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 -3
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP59]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP61]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP63]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP65]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP67]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP71]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = fadd fast <4 x float> [[REVERSE]], splat (float 1.000000e+00)
; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = fadd fast <4 x float> [[REVERSE3]], splat (float 1.000000e+00)
@@ -401,20 +401,20 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 -28
; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 -3
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP59]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP61]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP63]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP65]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP67]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP71]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = fadd fast <4 x float> [[REVERSE]], splat (float 1.000000e+00)
; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = fadd fast <4 x float> [[REVERSE3]], splat (float 1.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index f2f65685e9bad..0ac5734458002 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -39,13 +39,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP16:%.*]] = zext i32 [[TMP19]] to i64
; RV64-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP16]]
; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1
; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
; RV64-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP25]]
; RV64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i64 [[TMP18]]
-; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP19]] to i64
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
@@ -79,12 +79,12 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP9]]
; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP9]], 1
; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
; RV32-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP17]]
; RV32-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 [[TMP20]]
-; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]]
@@ -127,8 +127,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP21:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
@@ -144,8 +144,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP28]]
; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP30]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP27]], align 4
; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP27]], align 4
; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP32]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV64-UF2-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP7]]
@@ -224,24 +224,24 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP29:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP39:%.*]] = zext i32 [[TMP20]] to i64
; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP39]]
; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP39]], 1
; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]]
; RV64-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i64 [[TMP31]]
; RV64-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i64 [[TMP33]]
-; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP36:%.*]] = zext i32 [[TMP20]] to i64
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]]
; RV64-NEXT: [[TMP37:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV64-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV64-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
; RV64: [[SCALAR_PH]]:
; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
-; RV64-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
; RV64-NEXT: br label %[[FOR_BODY:.*]]
; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -286,18 +286,18 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP16]]
; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP16]], 1
; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]]
; RV32-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP23]], i32 [[TMP21]]
; RV32-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i32 [[TMP27]]
-; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP25]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]]
; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP29]]
; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
; RV32: [[SCALAR_PH]]:
@@ -365,8 +365,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP30]]
; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i64 [[TMP32]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, ptr [[TMP34]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD4]])
; RV64-UF2-NEXT: [[TMP35:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP36:%.*]] = add <vscale x 4 x i32> [[REVERSE5]], splat (i32 1)
@@ -382,8 +382,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP43]]
; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i64 [[TMP45]]
; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP35]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE6]], ptr [[TMP42]], align 4
; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP36]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE6]], ptr [[TMP42]], align 4
; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE7]], ptr [[TMP47]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; RV64-UF2-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP20]]
@@ -475,24 +475,24 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP29:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]]
+; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP39:%.*]] = zext i32 [[TMP20]] to i64
; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP39]]
; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP39]], 1
; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]]
; RV64-NEXT: [[TMP34:%.*]] = getelementptr float, ptr [[TMP30]], i64 [[TMP31]]
; RV64-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[TMP34]], i64 [[TMP33]]
-; RV64-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE3]], ptr align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
; RV64-NEXT: [[TMP36:%.*]] = zext i32 [[TMP20]] to i64
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]]
; RV64-NEXT: [[TMP37:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV64-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV64-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
; RV64: [[SCALAR_PH]]:
; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
-; RV64-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
; RV64-NEXT: br label %[[FOR_BODY:.*]]
; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
@@ -537,18 +537,18 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP22:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
+; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP16]]
; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP16]], 1
; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]]
; RV32-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[TMP23]], i32 [[TMP21]]
; RV32-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP24]], i32 [[TMP27]]
-; RV32-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE3]], ptr align 4 [[TMP25]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]]
; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP29]]
; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
; RV32: [[SCALAR_PH]]:
@@ -616,8 +616,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP30]]
; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP33]], i64 [[TMP32]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP34]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD4]])
; RV64-UF2-NEXT: [[TMP35:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP36:%.*]] = fadd <vscale x 4 x float> [[REVERSE5]], splat (float 1.000000e+00)
@@ -633,8 +633,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP43]]
; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP46]], i64 [[TMP45]]
; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP35]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE6]], ptr [[TMP42]], align 4
; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP36]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE6]], ptr [[TMP42]], align 4
; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE7]], ptr [[TMP47]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; RV64-UF2-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP20]]
@@ -702,19 +702,19 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP16:%.*]] = zext i32 [[TMP19]] to i64
; RV64-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP16]]
; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1
; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
; RV64-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP25]]
; RV64-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP20]], i64 [[TMP18]]
-; RV64-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP19]] to i64
; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
; RV64-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV64-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV64-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: br label %[[EXIT:.*]]
; RV64: [[EXIT]]:
@@ -742,18 +742,18 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP9]]
; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP9]], 1
; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
; RV32-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP17]]
; RV32-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 [[TMP20]]
-; RV32-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64
; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]]
; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP23]]
; RV32-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: br label %[[EXIT:.*]]
; RV32: [[EXIT]]:
@@ -790,8 +790,8 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP19]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP21:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
@@ -807,8 +807,8 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP31]], i64 [[TMP30]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP27]], align 4
; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP21]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP27]], align 4
; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP32]], align 4
; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; RV64-UF2-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP7]]
@@ -884,7 +884,7 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
; RV64-NEXT: br label %[[SCALAR_PH:.*]]
; RV64: [[SCALAR_PH]]:
@@ -935,7 +935,7 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
; RV32-NEXT: br label %[[SCALAR_PH:.*]]
; RV32: [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
index 13990000585ea..8a96a7f0b2d9d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
@@ -29,13 +29,13 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1
; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
@@ -151,13 +151,13 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
@@ -207,8 +207,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; NO-VP-NEXT: [[TMP20:%.*]] = mul i64 -1, [[TMP19]]
; NO-VP-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP18]]
; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP20]]
-; NO-VP-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
; NO-VP-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE2]])
+; NO-VP-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE4]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[REVERSE3]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -295,13 +295,13 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
+; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP16]], 1
; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP30]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
@@ -310,8 +310,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP31]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP27]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
index f804329169fe0..ad0cc57deb8a7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
@@ -15,27 +15,27 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[TMP15]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 0, [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 -1, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 [[TMP7]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[REVERSE]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
index 46695221c27db..8b6ce538474c7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
@@ -34,11 +34,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_B:%.+]]> = getelementptr inbounds ir<[[B:%.+]]>, ir<[[IDX_PROM]]>
; CHECK-NEXT: vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_B]]>, vp<[[EVL]]>
-; CHECK-NEXT: WIDEN ir<[[VAL_B:%.+]]> = vp.load vp<[[VEC_END_PTR_B]]>, vp<[[EVL]]>
-; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add ir<[[VAL_B]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<[[LOAD_B:%.+]]> = vp.load vp<[[VEC_END_PTR_B]]>, vp<[[EVL]]>
+; CHECK-NEXT: WIDEN-INTRINSIC vp<[[VAL_B:%.+]]> = call llvm.experimental.vp.reverse(ir<[[LOAD_B]]>, ir<true>, vp<[[EVL]]>)
+; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add vp<[[VAL_B]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_A:%.+]]> = getelementptr inbounds ir<[[A:%.+]]>, ir<[[IDX_PROM]]>
+; CHECK-NEXT: WIDEN-INTRINSIC vp<[[STORE_VAL:%.+]]> = call llvm.experimental.vp.reverse(ir<[[ADD_RESULT]]>, ir<true>, vp<[[EVL]]>)
; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_A]]>, vp<[[EVL]]>
-; CHECK-NEXT: WIDEN vp.store vp<[[VEC_END_PTR_A]]>, ir<[[ADD_RESULT]]>, vp<[[EVL]]>
+; CHECK-NEXT: WIDEN vp.store vp<[[VEC_END_PTR_A]]>, vp<[[STORE_VAL]]>, vp<[[EVL]]>
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[EVL]]>, vp<[[EVL_PHI]]>
; CHECK-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[EVL]]>
; CHECK-NEXT: EMIT vp<[[INDEX_NEXT]]> = add vp<[[INDUCTION]]>, vp<[[VFxUF]]>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index e4977ee642b09..1808e80a97060 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1126,12 +1126,12 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 -12
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 -3
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META22:![0-9]+]]
-; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META22]]
-; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META22]]
-; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META22]]
+; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer
@@ -1147,21 +1147,21 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP14]], i64 -12
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP20]], i64 -3
; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP15]], <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]]
-; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP17]], <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META25]]
-; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP19]], <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META25]]
-; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP21]], <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META25]]
+; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP15]], <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]]
+; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP17]], <4 x i1> [[REVERSE13]], <4 x double> poison), !alias.scope [[META25]]
+; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP19]], <4 x i1> [[REVERSE15]], <4 x double> poison), !alias.scope [[META25]]
+; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP21]], <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META25]]
; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP23:%.*]] = fadd <4 x double> [[REVERSE13]], splat (double 5.000000e-01)
+; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD14]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD16]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP23:%.*]] = fadd <4 x double> [[REVERSE22]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[REVERSE16]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[REVERSE19]], splat (double 5.000000e-01)
-; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[REVERSE22]], splat (double 5.000000e-01)
+; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[REVERSE23]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[OUT]], i64 [[OFFSET_IDX]]
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP27]], i64 0
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP35]], i64 -3
@@ -1172,13 +1172,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[TMP27]], i64 -12
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP33]], i64 -3
; AVX2-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE27:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr align 8 [[TMP28]], <4 x i1> [[REVERSE12]]), !alias.scope [[META27:![0-9]+]], !noalias [[META29:![0-9]+]]
-; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr align 8 [[TMP30]], <4 x i1> [[REVERSE14]]), !alias.scope [[META27]], !noalias [[META29]]
-; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr align 8 [[TMP32]], <4 x i1> [[REVERSE17]]), !alias.scope [[META27]], !noalias [[META29]]
-; AVX2-NEXT: [[REVERSE30:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr align 8 [[TMP34]], <4 x i1> [[REVERSE20]]), !alias.scope [[META27]], !noalias [[META29]]
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE27]], ptr align 8 [[TMP30]], <4 x i1> [[REVERSE13]]), !alias.scope [[META27]], !noalias [[META29]]
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE25]], ptr align 8 [[TMP32]], <4 x i1> [[REVERSE15]]), !alias.scope [[META27]], !noalias [[META29]]
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr align 8 [[TMP34]], <4 x i1> [[REVERSE17]]), !alias.scope [[META27]], !noalias [[META29]]
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX2-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
@@ -1217,13 +1217,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 -24
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 -7
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META34:![0-9]+]]
-; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META34]]
+; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META34]]
+; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META34]]
+; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META34]]
-; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META34]]
-; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD7]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP10:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer
; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <8 x i32> [[REVERSE9]], zeroinitializer
@@ -1239,20 +1239,20 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP20]], i64 -7
; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP15]], <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META37:![0-9]+]]
-; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP17]], <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META37]]
-; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP19]], <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META37]]
-; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP21]], <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META37]]
-; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> [[REVERSE13]], splat (double 5.000000e-01)
+; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP17]], <8 x i1> [[REVERSE13]], <8 x double> poison), !alias.scope [[META37]]
+; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP19]], <8 x i1> [[REVERSE15]], <8 x double> poison), !alias.scope [[META37]]
+; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP21]], <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META37]]
+; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD14]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD16]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[REVERSE16]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP25:%.*]] = fadd <8 x double> [[REVERSE19]], splat (double 5.000000e-01)
-; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[REVERSE22]], splat (double 5.000000e-01)
+; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[REVERSE23]], splat (double 5.000000e-01)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> [[REVERSE22]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[OUT]], i64 [[OFFSET_IDX]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP27]], i64 0
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP35]], i64 -7
@@ -1262,14 +1262,14 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP31]], i64 -7
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[TMP27]], i64 -24
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP33]], i64 -7
-; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr align 8 [[TMP28]], <8 x i1> [[REVERSE12]]), !alias.scope [[META39:![0-9]+]], !noalias [[META41:![0-9]+]]
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr align 8 [[TMP30]], <8 x i1> [[REVERSE14]]), !alias.scope [[META39]], !noalias [[META41]]
-; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr align 8 [[TMP32]], <8 x i1> [[REVERSE17]]), !alias.scope [[META39]], !noalias [[META41]]
-; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr align 8 [[TMP34]], <8 x i1> [[REVERSE20]]), !alias.scope [[META39]], !noalias [[META41]]
+; AVX512-NEXT: [[REVERSE27:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr align 8 [[TMP28]], <8 x i1> [[REVERSE12]]), !alias.scope [[META39:![0-9]+]], !noalias [[META41:![0-9]+]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE27]], ptr align 8 [[TMP30]], <8 x i1> [[REVERSE13]]), !alias.scope [[META39]], !noalias [[META41]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE25]], ptr align 8 [[TMP32]], <8 x i1> [[REVERSE15]]), !alias.scope [[META39]], !noalias [[META41]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr align 8 [[TMP34]], <8 x i1> [[REVERSE17]]), !alias.scope [[META39]], !noalias [[META41]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
index ffe9da09ca680..86dfd70ea754f 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
@@ -24,8 +24,8 @@ define void @i65_induction_with_negative_step(ptr %dst) {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 -4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 -3
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: store <4 x i64> [[REVERSE]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x i64> [[REVERSE]], ptr [[TMP6]], align 8
; CHECK-NEXT: store <4 x i64> [[REVERSE1]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i65 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 2200a7d0431d2..75f0017c86c9a 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -61,12 +61,12 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) {
; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 -12
; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
-; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD8]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3)
; IC4VF4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[REVERSE5]], splat (i64 3)
@@ -867,12 +867,12 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) {
; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 -12
; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
-; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD8]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3)
; IC4VF4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[REVERSE5]], splat (i64 3)
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index a2649053680d2..4d8b46cc6fdeb 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -41,8 +41,8 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x ptr> [[WIDE_LOAD]], <2 x ptr> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[REVERSE]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[REVERSE]], i32 1
; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[REVERSE]], i32 1
; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
index d3e291e4f3ed2..2eb79c405e528 100644
--- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -25,8 +25,8 @@ define i32 @reverse_induction_i64(i64 %startval, ptr %ptr) {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 -4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -3
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[REVERSE4]], [[VEC_PHI2]]
@@ -79,8 +79,8 @@ define i32 @reverse_induction_i128(i128 %startval, ptr %ptr) {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 -4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -3
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[REVERSE4]], [[VEC_PHI2]]
@@ -139,8 +139,8 @@ define i32 @reverse_induction_i16(i16 %startval, ptr %ptr) {
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 -4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 -3
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP15]] = add <4 x i32> [[REVERSE4]], [[VEC_PHI2]]
@@ -226,8 +226,8 @@ define void @reverse_forward_induction_i64_i8() {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 -4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 -3
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP9]], align 4
; CHECK-NEXT: store <4 x i32> [[REVERSE2]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[STEP_ADD]], splat (i8 4)
@@ -279,8 +279,8 @@ define void @reverse_forward_induction_i64_i8_signed() {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 -4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 -3
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP9]], align 4
; CHECK-NEXT: store <4 x i32> [[REVERSE2]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[STEP_ADD]], splat (i8 4)
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index bdf73d6a52c22..f08b6e69e3e1c 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -578,12 +578,12 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -12
; VF4IC4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i64 -3
; VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
-; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
-; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
+; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
; VF4IC4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i64 0
@@ -595,12 +595,12 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
; VF4IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i64 -12
; VF4IC4-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3
; VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP46]], align 1
-; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
-; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
-; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i8>, ptr [[TMP42]], align 1
+; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD15]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE14]], [[REVERSE15]]
; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]]
@@ -816,12 +816,12 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
; VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -12
; VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 -3
; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
-; VF4IC4-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
-; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
-; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; VF4IC4-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD6]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
; VF4IC4-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 0
@@ -833,12 +833,12 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
; VF4IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 -12
; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3
; VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
-; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
-; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 88dead4418628..9c4c9534157c7 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -1159,7 +1159,8 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
; CHECK-NEXT: CLONE ir<%ptr.iv.next> = getelementptr inbounds vp<[[PTR_IV]]>, ir<-1>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%ptr.iv.next>, vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%c.1> = icmp ne ir<%l>, ir<0>
+; CHECK-NEXT: EMIT vp<%9> = reverse ir<%l>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp ne vp<%9>, ir<0>
; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
; CHECK-NEXT: <xVFxUF> pred.store: {
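With this first patch, the reverse is no longer folded into the memory recipe: the dumps above now show a standalone EMIT ... = reverse ... node between a widened load and its users (and, symmetrically, between a stored value's definition and the store). A minimal sketch of matching such a node, assuming the VPlanPatternMatch helpers used later in this series, where R names the recipe being visited:

  // Minimal sketch, assuming llvm::VPlanPatternMatch is in scope: recover
  // the vector operand of a reverse node like the EMIT printed above.
  using namespace llvm::VPlanPatternMatch;
  VPValue *Src;
  if (match(&R, m_VPInstruction<VPInstruction::Reverse>(m_VPValue(Src)))) {
    // Src is the widened load result (load case) or the vector about to be
    // stored (store case); the reverse node permutes its lanes end-to-end.
  }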
>From 804203aa22c84f307f1324470d80eae461e18a8d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 14 Aug 2025 00:28:16 -0700
Subject: [PATCH 02/13] fix TTI::CastContextHint
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 +++++++++++++++----
1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b9ba3474a6b8c..23ecda9d85c58 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2271,21 +2271,37 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
return TTI::CastContextHint::Normal;
};
+ using namespace llvm::VPlanPatternMatch;
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
// For Trunc/FPTrunc, get the context from the only user.
- if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
- !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
- if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
- CCH = ComputeCCH(StoreRecipe);
+ if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
+ auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
+ if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
+ return nullptr;
+ return dyn_cast<VPRecipeBase>(*R->user_begin());
+ };
+
+ if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
+ if (match(Recipe, m_VPInstruction<VPInstruction::Reverse>(m_VPValue())))
+ Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
+ if (Recipe)
+ CCH = ComputeCCH(Recipe);
+ }
}
// For Z/Sext, get the context from the operand.
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
Opcode == Instruction::FPExt) {
if (Operand->isLiveIn())
CCH = TTI::CastContextHint::Normal;
- else if (Operand->getDefiningRecipe())
- CCH = ComputeCCH(Operand->getDefiningRecipe());
+ else if (auto *Recipe = Operand->getDefiningRecipe()) {
+ VPValue *ReverseOp;
+ if (match(Recipe,
+ m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReverseOp))))
+ Recipe = ReverseOp->getDefiningRecipe();
+ if (Recipe)
+ CCH = ComputeCCH(Recipe);
+ }
}
auto *SrcTy =
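Without this fix, the newly extracted reverse node would sit between a cast and the memory recipe that used to supply its TTI::CastContextHint, and the cost model would fall back to CastContextHint::None. The change therefore looks through one reverse node in both directions. A hedged sketch of the operand-direction case only (the helper name is invented; the Trunc/FPTrunc case instead walks forward through GetOnlyUser):

  // Hedged sketch: strip one reverse node off a use-def edge before
  // classifying the cast context, as the ZExt/SExt/FPExt path above does.
  using namespace llvm::VPlanPatternMatch;
  static VPRecipeBase *lookThroughReverse(VPRecipeBase *R) {
    VPValue *Op;
    if (R && match(R, m_VPInstruction<VPInstruction::Reverse>(m_VPValue(Op))))
      return Op->getDefiningRecipe();
    return R;
  }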
>From d7cb3afae1030ca2b2d416378932c04c3535f0d9 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 25 Aug 2025 20:10:54 -0700
Subject: [PATCH 03/13] Change planContainsAdditionalSimplifications for LICM
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f1a2b338327e2..f47e082cb2a0f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7075,6 +7075,23 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
cast<VPRecipeWithIRFlags>(R).getPredicate() !=
cast<CmpInst>(UI)->getPredicate())
return true;
+
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
+ if (MemR->isReverse()) {
+ // If the stored value of a reverse store is invariant, LICM will
+ // hoist the reverse operation to the preheader. In this case, the
+ // result of the VPlan-based cost model will diverge from that of
+ // the legacy model.
+ if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemR))
+ if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+ return true;
+
+ if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR))
+ if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+ return true;
+ }
+ }
+
SeenInstrs.insert(UI);
}
}
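The divergence this guards against is visible in the tail-folding-uniform-store hunk near the top of this mail, where the reverse of the invariant zeroinitializer is now emitted once in vector.ph rather than per iteration. A hedged restatement of the new check as a standalone predicate (the function name is invented for illustration):

  // Hedged sketch: true when LICM can hoist the reverse feeding this store,
  // making the VPlan-based cost diverge from the legacy model's.
  static bool reverseStoreOfInvariantValue(VPWidenMemoryRecipe *MemR) {
    if (!MemR->isReverse())
      return false;
    if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemR))
      return StoreR->getStoredValue()->isDefinedOutsideLoopRegions();
    if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR))
      return StoreR->getStoredValue()->isDefinedOutsideLoopRegions();
    return false;
  }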
>From bfd9e187347ae2f71d5e9197e3d6e52ddd037db2 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 26 Aug 2025 03:24:49 -0700
Subject: [PATCH 04/13] Change the way to convert reverse operations during
EVL lowering.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 80 ++++++++++++-------
1 file changed, 53 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 75a2ee1a0869c..e16a406f22964 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2766,6 +2766,22 @@ static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
return RemoveMask_match<Op0_t, Op1_t>(In, Out);
}
+/// If \p R is a VPInstruction::Reverse, return a VPWidenIntrinsicRecipe
+/// for the vp.reverse intrinsic using \p EVL. Returns nullptr otherwise.
+static VPWidenIntrinsicRecipe *
+getEVLReverse(VPRecipeBase &R, VPTypeAnalysis &TypeInfo, VPValue &EVL) {
+ VPValue *ReversedVal;
+ if (!match(&R,
+ m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReversedVal))))
+ return nullptr;
+
+ auto *Reverse = cast<VPInstruction>(&R);
+ VPlan *Plan = Reverse->getParent()->getPlan();
+ return new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, {ReversedVal, Plan->getTrue(), &EVL},
+ TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
+}
+
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
@@ -2849,32 +2865,6 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return nullptr;
}
-static void convertToEVLReverse(VPlan &Plan, VPTypeAnalysis &TypeInfo,
- VPValue &EVL) {
- SmallVector<VPRecipeBase *> ToRemove;
-
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
- for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- auto *VPI = dyn_cast<VPInstruction>(&R);
- if (!VPI || VPI->getOpcode() != VPInstruction::Reverse)
- continue;
-
- SmallVector<VPValue *> Ops(VPI->operands());
- Ops.append({Plan.getTrue(), &EVL});
- auto *NewReverse = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse, Ops,
- TypeInfo.inferScalarType(VPI), {}, {}, VPI->getDebugLoc());
- NewReverse->insertBefore(VPI);
- VPI->replaceAllUsesWith(NewReverse);
- ToRemove.push_back(VPI);
- }
- }
-
- for (VPRecipeBase *R : ToRemove)
- R->eraseFromParent();
-}
-
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(Plan);
@@ -2989,8 +2979,44 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
ToErase.push_back(CurRecipe);
+
+ // Convert general reverse operations on loaded values and stored values
+ // into vp.reverse, when the VPVectorEndPointerRecipe adjusting the access
+ // address uses EVL instead of VF.
+ // TODO: Extend conversion along the def-use/use-def chain, as reverse
+ // operations may be eliminated or moved in the future.
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(EVLRecipe)) {
+ if (!match(MemR->getAddr(), m_VecEndPtr(m_VPValue(), m_Specific(&EVL))))
+ continue;
+ assert(MemR->isReverse() &&
+ "Only reverse access uses VPVectorEndPointerRecipe as address");
+
+ VPRecipeBase *Candidate = nullptr;
+ if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(MemR)) {
+ assert(LoadR->getNumUsers() == 1 &&
+ "Unexpected user number of reverse load");
+ Candidate = cast<VPRecipeBase>(*LoadR->user_begin());
+ } else if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR)) {
+ VPValue *StoredVal = StoreR->getStoredValue();
+ // Skip if the stored value is not defined in the loop region.
+ if (StoredVal->isDefinedOutsideLoopRegions())
+ continue;
+ Candidate = StoredVal->getDefiningRecipe();
+ }
+ assert(Candidate && "Must have one reverse operation for reverse access");
+
+ if (match(Candidate, m_Intrinsic<Intrinsic::experimental_vp_reverse>()))
+ continue;
+
+ VPWidenIntrinsicRecipe *NewReverse =
+ getEVLReverse(*Candidate, TypeInfo, EVL);
+ assert(NewReverse &&
+ "Unable to get an EVL reverse when tail folding by EVL");
+ NewReverse->insertBefore(Candidate);
+ cast<VPInstruction>(Candidate)->replaceAllUsesWith(NewReverse);
+ ToErase.push_back(Candidate);
+ }
}
- convertToEVLReverse(Plan, TypeInfo, EVL);
// Remove dead EVL mask.
if (EVLMask->getNumUsers() == 0)
ToErase.push_back(EVLMask->getDefiningRecipe());
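The effect of this step is what the vplan-riscv-vector-reverse.ll hunk earlier in this mail checks: each plain reverse next to an EVL-driven memory access becomes a WIDEN-INTRINSIC call of llvm.experimental.vp.reverse with an all-true mask and the EVL as the element count. Condensed, with the names taken from getEVLReverse above:

  // Before: EMIT            vp<%r> = reverse ir<%l>
  // After:  WIDEN-INTRINSIC vp<%r> = call llvm.experimental.vp.reverse(
  //                                       ir<%l>, ir<true>, vp<%evl>)
  auto *NewReverse = new VPWidenIntrinsicRecipe(
      Intrinsic::experimental_vp_reverse, {ReversedVal, Plan->getTrue(), &EVL},
      TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
  NewReverse->insertBefore(Reverse);
  Reverse->replaceAllUsesWith(NewReverse);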
>From df90ac517452cdd19044102fdac735ecb780d509 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 11 Sep 2025 04:16:42 -0700
Subject: [PATCH 05/13] Split the approach that transforms the reverse to
vp.reverse for loaded results and stored values
---
llvm/lib/Transforms/Vectorize/VPlan.h | 9 ++
.../Transforms/Vectorize/VPlanTransforms.cpp | 100 ++++++++----------
2 files changed, 56 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3b8b8e2f2aa00..e846cb700d421 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3472,6 +3472,15 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
setMask(Mask);
}
+ VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
+ VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+ {Addr, StoredVal, &EVL}, S.isConsecutive(),
+ S.isReverse(), S, S.getDebugLoc()) {
+ assert(isReverse() && "Only reverse accesses need to set a new stored value");
+ setMask(Mask);
+ }
+
VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
/// Return the address accessed by this recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e16a406f22964..6f8a6f20a8049 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2766,22 +2766,6 @@ static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
return RemoveMask_match<Op0_t, Op1_t>(In, Out);
}
-/// If \p R is a VPInstruction::Reverse, return a VPWidenIntrinsicRecipe
-/// for the vp.reverse intrinsic using \p EVL. Returns nullptr otherwise.
-static VPWidenIntrinsicRecipe *
-getEVLReverse(VPRecipeBase &R, VPTypeAnalysis &TypeInfo, VPValue &EVL) {
- VPValue *ReversedVal;
- if (!match(&R,
- m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReversedVal))))
- return nullptr;
-
- auto *Reverse = cast<VPInstruction>(&R);
- VPlan *Plan = Reverse->getParent()->getPlan();
- return new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse, {ReversedVal, Plan->getTrue(), &EVL},
- TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
-}
-
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
@@ -2824,12 +2808,31 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
EVL, Mask);
- if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
+ VPValue *StoredVal;
+ if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
- cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
- return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
- AdjustEndPtr(EndPtr), EVL, Mask);
+ cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
+ auto *StoreR = cast<VPWidenStoreRecipe>(&CurRecipe);
+ // Convert general reverse operations on stored value into vp.reverse.
+ // Skip if the stored value is not defined in the loop region.
+ if (!StoredVal->isDefinedOutsideLoopRegions()) {
+ VPValue *ReversedVal;
+ bool IsReverse = match(StoredVal, m_VPInstruction<VPInstruction::Reverse>(
+ m_VPValue(ReversedVal)));
+ assert(IsReverse && "The stored value of reverse store must be defined "
+ "by a reverse operation");
+ auto *Reverse = cast<VPInstruction>(StoredVal);
+ auto *NewReverse = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse,
+ {ReversedVal, Plan->getTrue(), &EVL},
+ TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
+ NewReverse->insertBefore(Reverse);
+ return new VPWidenStoreEVLRecipe(*StoreR, AdjustEndPtr(EndPtr),
+ NewReverse, EVL, Mask);
+ }
+ return new VPWidenStoreEVLRecipe(*StoreR, AdjustEndPtr(EndPtr), EVL, Mask);
+ }
if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
if (Rdx->isConditional() &&
@@ -2980,41 +2983,32 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
ToErase.push_back(CurRecipe);
- // Convert general reverse operations on loaded values and stored values
- // into vp.reverse, when the VPVectorEndPointerRecipe adjusting the access
- // address uses EVL instead of VF.
- // TODO: Extend conversion along the def-use/use-def chain, as reverse
- // operations may be eliminated or moved in the future.
- if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(EVLRecipe)) {
- if (!match(MemR->getAddr(), m_VecEndPtr(m_VPValue(), m_Specific(&EVL))))
+ // Convert general reverse operations on loaded results into vp.reverse,
+ // when the VPVectorEndPointerRecipe adjusting the access address uses EVL
+ // instead of VF.
+ if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(EVLRecipe)) {
+ if (!match(LoadR->getAddr(), m_VecEndPtr(m_VPValue(), m_Specific(&EVL))))
continue;
- assert(MemR->isReverse() &&
+ assert(LoadR->isReverse() &&
"Only reverse access uses VPVectorEndPointerRecipe as address");
-
- VPRecipeBase *Candidate = nullptr;
- if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(MemR)) {
- assert(LoadR->getNumUsers() == 1 &&
- "Unexpected user number of reverse load");
- Candidate = cast<VPRecipeBase>(*LoadR->user_begin());
- } else if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR)) {
- VPValue *StoredVal = StoreR->getStoredValue();
- // Skip if the stored value is not defined in the loop region.
- if (StoredVal->isDefinedOutsideLoopRegions())
- continue;
- Candidate = StoredVal->getDefiningRecipe();
- }
- assert(Candidate && "Must have one reverse operation for reverse access");
-
- if (match(Candidate, m_Intrinsic<Intrinsic::experimental_vp_reverse>()))
- continue;
-
- VPWidenIntrinsicRecipe *NewReverse =
- getEVLReverse(*Candidate, TypeInfo, EVL);
- assert(NewReverse &&
- "Unable to get an EVL reverse when tail folding by EVL");
- NewReverse->insertBefore(Candidate);
- cast<VPInstruction>(Candidate)->replaceAllUsesWith(NewReverse);
- ToErase.push_back(Candidate);
+ // TODO: Extend conversion along the use-def chain, as reverse operations
+ // may be eliminated or sunk in the future.
+ assert(LoadR->getNumUsers() == 1 &&
+ "Unexpected user number of reverse load");
+ auto *UserR = cast<VPRecipeBase>(*LoadR->user_begin());
+ VPValue *ReversedVal;
+ bool IsReverse = match(UserR, m_VPInstruction<VPInstruction::Reverse>(
+ m_VPValue(ReversedVal)));
+ assert(IsReverse && "The defined value of reverse load must be used by a "
+ "reverse operation");
+ auto *Reverse = cast<VPInstruction>(UserR);
+ auto *NewReverse = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse,
+ {ReversedVal, Plan.getTrue(), &EVL},
+ TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
+ NewReverse->insertBefore(Reverse);
+ Reverse->replaceAllUsesWith(NewReverse);
+ ToErase.push_back(Reverse);
}
}
// Remove dead EVL mask.
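After this split (reverted again in PATCH 07 below), the store side is rewritten inside optimizeMaskToEVL, where the stored value is at hand, while the load side is rewritten when the EVL load's single reverse user is visited. The VPWidenStoreEVLRecipe constructor added above exists so the freshly built vp.reverse can be passed in as the new stored value; a hedged sketch of that hand-off, reusing names from the hunk above:

  // The stored value is swapped for its vp.reverse counterpart before the
  // EVL store is created (StoreR, EndPtr, NewReverse as defined above).
  NewReverse->insertBefore(Reverse);
  return new VPWidenStoreEVLRecipe(*StoreR, AdjustEndPtr(EndPtr), NewReverse,
                                   EVL, Mask);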
>From e99468bb8af3f29103e5471cb29e814885bca54e Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 20 Nov 2025 01:10:05 -0800
Subject: [PATCH 06/13] Move the code in planContainsAdditionalSimplifications
---
.../Transforms/Vectorize/LoopVectorize.cpp | 31 +++++++++----------
1 file changed, 14 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f47e082cb2a0f..a5a9b24a3c2f2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7055,6 +7055,20 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
return true;
+
+ if (WidenMemR->isReverse()) {
+ // If the stored value of a reverse store is invariant, LICM will
+ // hoist the reverse operation to the preheader. In this case, the
+ // result of the VPlan-based cost model will diverge from that of
+ // the legacy model.
+ if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
+ if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+ return true;
+
+ if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
+ if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+ return true;
+ }
}
/// If a VPlan transform folded a recipe to one producing a single-scalar,
@@ -7075,23 +7089,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
cast<VPRecipeWithIRFlags>(R).getPredicate() !=
cast<CmpInst>(UI)->getPredicate())
return true;
-
- if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
- if (MemR->isReverse()) {
- // If the stored value of a reverse store is invariant, LICM will
- // hoist the reverse operation to the preheader. In this case, the
- // result of the VPlan-based cost model will diverge from that of
- // the legacy model.
- if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemR))
- if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
- return true;
-
- if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR))
- if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
- return true;
- }
- }
-
SeenInstrs.insert(UI);
}
}
>From d0436f5d35316101e4052adffd386398d1da08f9 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 26 Nov 2025 00:11:14 -0800
Subject: [PATCH 07/13] Revert "Split the appoarch that transforms the reverse
to vp.reverse for loaded result and stored value"
This reverts commit 2ba86017dc77a92cfc1189b340c5551493ded368.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 9 --
.../Transforms/Vectorize/VPlanTransforms.cpp | 100 ++++++++++--------
2 files changed, 53 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e846cb700d421..3b8b8e2f2aa00 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3472,15 +3472,6 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
setMask(Mask);
}
- VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
- VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
- : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
- {Addr, StoredVal, &EVL}, S.isConsecutive(),
- S.isReverse(), S, S.getDebugLoc()) {
- assert(isReverse() && "Only reverse access need to set new stored value");
- setMask(Mask);
- }
-
VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
/// Return the address accessed by this recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6f8a6f20a8049..e16a406f22964 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2766,6 +2766,22 @@ static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
return RemoveMask_match<Op0_t, Op1_t>(In, Out);
}
+/// If \p R is a VPInstruction::Reverse, return a VPWidenIntrinsicRecipe
+/// for the vp.reverse intrinsic using \p EVL. Returns nullptr otherwise.
+static VPWidenIntrinsicRecipe *
+getEVLReverse(VPRecipeBase &R, VPTypeAnalysis &TypeInfo, VPValue &EVL) {
+ VPValue *ReversedVal;
+ if (!match(&R,
+ m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReversedVal))))
+ return nullptr;
+
+ auto *Reverse = cast<VPInstruction>(&R);
+ VPlan *Plan = Reverse->getParent()->getPlan();
+ return new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, {ReversedVal, Plan->getTrue(), &EVL},
+ TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
+}
+
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
@@ -2808,31 +2824,12 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
EVL, Mask);
- VPValue *StoredVal;
- if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
+ if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
- cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
- auto *StoreR = cast<VPWidenStoreRecipe>(&CurRecipe);
- // Convert general reverse operations on stored value into vp.reverse.
- // Skip if the stored value is not defined in the loop region.
- if (!StoredVal->isDefinedOutsideLoopRegions()) {
- VPValue *ReversedVal;
- bool IsReverse = match(StoredVal, m_VPInstruction<VPInstruction::Reverse>(
- m_VPValue(ReversedVal)));
- assert(IsReverse && "The stored value of reverse store must be defined "
- "by a reverse operation");
- auto *Reverse = cast<VPInstruction>(StoredVal);
- auto *NewReverse = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse,
- {ReversedVal, Plan->getTrue(), &EVL},
- TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
- NewReverse->insertBefore(Reverse);
- return new VPWidenStoreEVLRecipe(*StoreR, AdjustEndPtr(EndPtr),
- NewReverse, EVL, Mask);
- }
- return new VPWidenStoreEVLRecipe(*StoreR, AdjustEndPtr(EndPtr), EVL, Mask);
- }
+ cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
+ return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
+ AdjustEndPtr(EndPtr), EVL, Mask);
if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
if (Rdx->isConditional() &&
@@ -2983,32 +2980,41 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
ToErase.push_back(CurRecipe);
- // Convert general reverse operations on loaded results into vp.reverse,
- // when the VPVectorEndPointerRecipe adjusting the access address uses EVL
- // instead of VF.
- if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(EVLRecipe)) {
- if (!match(LoadR->getAddr(), m_VecEndPtr(m_VPValue(), m_Specific(&EVL))))
+ // Convert general reverse operations on loaded values and stored values
+ // into vp.reverse, when the VPVectorEndPointerRecipe adjusting the access
+ // address uses EVL instead of VF.
+ // TODO: Extend conversion along the def-use/use-def chain, as reverse
+ // operations may be eliminated or moved in the future.
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(EVLRecipe)) {
+ if (!match(MemR->getAddr(), m_VecEndPtr(m_VPValue(), m_Specific(&EVL))))
continue;
- assert(LoadR->isReverse() &&
+ assert(MemR->isReverse() &&
"Only reverse access uses VPVectorEndPointerRecipe as address");
- // TODO: Extend conversion along the use-def chain, as reverse operations
- // may be eliminated or sunk in the future.
- assert(LoadR->getNumUsers() == 1 &&
- "Unexpected user number of reverse load");
- auto *UserR = cast<VPRecipeBase>(*LoadR->user_begin());
- VPValue *ReversedVal;
- bool IsReverse = match(UserR, m_VPInstruction<VPInstruction::Reverse>(
- m_VPValue(ReversedVal)));
- assert(IsReverse && "The defined value of reverse load must be used by a "
- "reverse operation");
- auto *Reverse = cast<VPInstruction>(UserR);
- auto *NewReverse = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse,
- {ReversedVal, Plan.getTrue(), &EVL},
- TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
- NewReverse->insertBefore(Reverse);
- Reverse->replaceAllUsesWith(NewReverse);
- ToErase.push_back(Reverse);
+
+ VPRecipeBase *Candidate = nullptr;
+ if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(MemR)) {
+ assert(LoadR->getNumUsers() == 1 &&
+ "Unexpected user number of reverse load");
+ Candidate = cast<VPRecipeBase>(*LoadR->user_begin());
+ } else if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR)) {
+ VPValue *StoredVal = StoreR->getStoredValue();
+ // Skip if the stored value is not defined in the loop region.
+ if (StoredVal->isDefinedOutsideLoopRegions())
+ continue;
+ Candidate = StoredVal->getDefiningRecipe();
+ }
+ assert(Candidate && "Must have one reverse operation for reverse access");
+
+ if (match(Candidate, m_Intrinsic<Intrinsic::experimental_vp_reverse>()))
+ continue;
+
+ VPWidenIntrinsicRecipe *NewReverse =
+ getEVLReverse(*Candidate, TypeInfo, EVL);
+ assert(NewReverse &&
+ "Unable to get an EVL reverse when tail folding by EVL");
+ NewReverse->insertBefore(Candidate);
+ cast<VPInstruction>(Candidate)->replaceAllUsesWith(NewReverse);
+ ToErase.push_back(Candidate);
}
}
// Remove dead EVL mask.
>From 8dd5804750db128e336d282bcdaf5f5868592eb8 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 26 Nov 2025 00:11:54 -0800
Subject: [PATCH 08/13] Revert "Change the way to convert reverse operations
during EVL lowering."
This reverts commit 293a127bd8811aeba7905145386b50b19b2a96e8.
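With the conversion factored out into convertToEVLReverse, every
VPInstruction::Reverse remaining in the loop region is rewritten to the VP
intrinsic. At the IR level this is the shape the RISC-V tests check for,
e.g. (simplified):

  %r = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v)

becomes

  %r = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true), i32 %evl)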
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 80 +++++++------------
1 file changed, 27 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e16a406f22964..75a2ee1a0869c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2766,22 +2766,6 @@ static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
return RemoveMask_match<Op0_t, Op1_t>(In, Out);
}
-/// If \p R is a VPInstruction::Reverse, return a VPWidenIntrinsicRecipe
-/// for the vp.reverse intrinsic using \p EVL. Returns nullptr otherwise.
-static VPWidenIntrinsicRecipe *
-getEVLReverse(VPRecipeBase &R, VPTypeAnalysis &TypeInfo, VPValue &EVL) {
- VPValue *ReversedVal;
- if (!match(&R,
- m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReversedVal))))
- return nullptr;
-
- auto *Reverse = cast<VPInstruction>(&R);
- VPlan *Plan = Reverse->getParent()->getPlan();
- return new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse, {ReversedVal, Plan->getTrue(), &EVL},
- TypeInfo.inferScalarType(Reverse), {}, {}, Reverse->getDebugLoc());
-}
-
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
@@ -2865,6 +2849,32 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return nullptr;
}
+static void convertToEVLReverse(VPlan &Plan, VPTypeAnalysis &TypeInfo,
+ VPValue &EVL) {
+ SmallVector<VPRecipeBase *> ToRemove;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (!VPI || VPI->getOpcode() != VPInstruction::Reverse)
+ continue;
+
+ SmallVector<VPValue *> Ops(VPI->operands());
+ Ops.append({Plan.getTrue(), &EVL});
+ auto *NewReverse = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, Ops,
+ TypeInfo.inferScalarType(VPI), {}, {}, VPI->getDebugLoc());
+ NewReverse->insertBefore(VPI);
+ VPI->replaceAllUsesWith(NewReverse);
+ ToRemove.push_back(VPI);
+ }
+ }
+
+ for (VPRecipeBase *R : ToRemove)
+ R->eraseFromParent();
+}
+
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(Plan);
@@ -2979,44 +2989,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
ToErase.push_back(CurRecipe);
-
- // Convert general reverse operations on loaded values and stored values
- // into vp.reverse, when the VPVectorEndPointerRecipe adjusting the access
- // address uses EVL instead of VF.
- // TODO: Extend conversion along the def-use/use-def chain, as reverse
- // operations may be eliminated or moved in the future.
- if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(EVLRecipe)) {
- if (!match(MemR->getAddr(), m_VecEndPtr(m_VPValue(), m_Specific(&EVL))))
- continue;
- assert(MemR->isReverse() &&
- "Only reverse access uses VPVectorEndPointerRecipe as address");
-
- VPRecipeBase *Candidate = nullptr;
- if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(MemR)) {
- assert(LoadR->getNumUsers() == 1 &&
- "Unexpected user number of reverse load");
- Candidate = cast<VPRecipeBase>(*LoadR->user_begin());
- } else if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR)) {
- VPValue *StoredVal = StoreR->getStoredValue();
- // Skip if the stored value is not defined in the loop region.
- if (StoredVal->isDefinedOutsideLoopRegions())
- continue;
- Candidate = StoredVal->getDefiningRecipe();
- }
- assert(Candidate && "Must have one reverse operation for reverse access");
-
- if (match(Candidate, m_Intrinsic<Intrinsic::experimental_vp_reverse>()))
- continue;
-
- VPWidenIntrinsicRecipe *NewReverse =
- getEVLReverse(*Candidate, TypeInfo, EVL);
- assert(NewReverse &&
- "Unable to get an EVL reverse when tail folding by EVL");
- NewReverse->insertBefore(Candidate);
- cast<VPInstruction>(Candidate)->replaceAllUsesWith(NewReverse);
- ToErase.push_back(Candidate);
- }
}
+ convertToEVLReverse(Plan, TypeInfo, EVL);
// Remove dead EVL mask.
if (EVLMask->getNumUsers() == 0)
ToErase.push_back(EVLMask->getDefiningRecipe());
>From 4aafcdeee8a0a1d8fa66f19151b4f3e69e3b604f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 26 Nov 2025 02:09:21 -0800
Subject: [PATCH 09/13] Reuse traversal loop
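Drop the separate convertToEVLReverse walk and convert Reverse in the same
deep traversal that already rewrites FirstOrderRecurrenceSplice to
vp.splice, so both conversions share one replacement tail (a condensed
sketch of the new loop body, not the exact diff):

  VPWidenIntrinsicRecipe *NewRecipe = nullptr;
  // Build vp.splice for FirstOrderRecurrenceSplice, or vp.reverse for
  // VPInstruction::Reverse; NewRecipe stays null otherwise.
  ...
  if (!NewRecipe)
    continue;
  NewRecipe->insertBefore(&R);
  R.getVPSingleValue()->replaceAllUsesWith(NewRecipe);
  ToErase.push_back(&R);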
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 72 ++++++++-----------
1 file changed, 30 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 75a2ee1a0869c..6f707cc116c6a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2849,32 +2849,6 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return nullptr;
}
-static void convertToEVLReverse(VPlan &Plan, VPTypeAnalysis &TypeInfo,
- VPValue &EVL) {
- SmallVector<VPRecipeBase *> ToRemove;
-
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
- for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- auto *VPI = dyn_cast<VPInstruction>(&R);
- if (!VPI || VPI->getOpcode() != VPInstruction::Reverse)
- continue;
-
- SmallVector<VPValue *> Ops(VPI->operands());
- Ops.append({Plan.getTrue(), &EVL});
- auto *NewReverse = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse, Ops,
- TypeInfo.inferScalarType(VPI), {}, {}, VPI->getDebugLoc());
- NewReverse->insertBefore(VPI);
- VPI->replaceAllUsesWith(NewReverse);
- ToRemove.push_back(VPI);
- }
- }
-
- for (VPRecipeBase *R : ToRemove)
- R->eraseFromParent();
-}
-
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(Plan);
@@ -2912,6 +2886,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// contained.
bool ContainsFORs =
any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
+ VPValue *PrevEVL = nullptr;
if (ContainsFORs) {
// TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
VPValue *MaxEVL = &Plan.getVF();
@@ -2922,28 +2897,42 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
Builder.setInsertPoint(Header, Header->getFirstNonPhi());
- VPValue *PrevEVL = Builder.createScalarPhi(
- {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
-
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
- for (VPRecipeBase &R : *VPBB) {
- VPValue *V1, *V2;
- if (!match(&R,
- m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
- m_VPValue(V1), m_VPValue(V2))))
- continue;
+ PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc::getUnknown(),
+ "prev.evl");
+ }
+
+  // Transform recipes that must be converted to vector predication
+  // intrinsics even if they do not use the header mask.
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ VPWidenIntrinsicRecipe *NewRecipe = nullptr;
+ VPValue *V1, *V2;
+ if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+ m_VPValue(V1), m_VPValue(V2)))) {
VPValue *Imm = Plan.getOrAddLiveIn(
ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
- VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
+ NewRecipe = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_splice,
{V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
R.getDebugLoc());
- VPSplice->insertBefore(&R);
- R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
- ToErase.push_back(&R);
}
+
+ // TODO: Only convert reverse to vp.reverse if it uses the result of
+ // vp.load, or defines the stored value of vp.store.
+ if (match(&R, m_VPInstruction<VPInstruction::Reverse>(m_VPValue(V1)))) {
+ NewRecipe = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, {V1, Plan.getTrue(), &EVL},
+ TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
+ R.getDebugLoc());
+ }
+
+ if (!NewRecipe)
+ continue;
+ NewRecipe->insertBefore(&R);
+ R.getVPSingleValue()->replaceAllUsesWith(NewRecipe);
+ ToErase.push_back(&R);
}
}
@@ -2990,7 +2979,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
ToErase.push_back(CurRecipe);
}
- convertToEVLReverse(Plan, TypeInfo, EVL);
// Remove dead EVL mask.
if (EVLMask->getNumUsers() == 0)
ToErase.push_back(EVLMask->getDefiningRecipe());
>From 2a258a5461093f76ca7d3fa0a9a7146e65fca400 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Sat, 29 Nov 2025 23:21:19 -0800
Subject: [PATCH 10/13] Remove "using namespace llvm::VPlanPatternMatch;"
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 23ecda9d85c58..2fb83567894ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2271,7 +2271,6 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
return TTI::CastContextHint::Normal;
};
- using namespace llvm::VPlanPatternMatch;
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
// For Trunc/FPTrunc, get the context from the only user.
>From adef7d6334481307375d9e1aa2c01c960c1639f1 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 4 Dec 2025 00:17:11 -0800
Subject: [PATCH 11/13] Introduce m_Reverse
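Add a pattern-match shorthand for VPInstruction::Reverse, mirroring the
existing helpers such as m_StepVector and m_LastActiveLane. A match that
previously read

  match(&R, m_VPInstruction<VPInstruction::Reverse>(m_VPValue(V)))

can now be written as

  match(&R, m_Reverse(m_VPValue(V)))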
---
llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 750ef8edd94bb..6f7d357dc29e8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -454,6 +454,12 @@ m_LastActiveLane(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::LastActiveLane>(Op0);
}
+template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::Reverse, Op0_t>
+m_Reverse(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::Reverse>(Op0);
+}
+
inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() {
return m_VPInstruction<VPInstruction::StepVector>();
}
>From c76d928182a1676d8d7db77090c1c9e969c7ebcf Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 4 Dec 2025 00:17:35 -0800
Subject: [PATCH 12/13] NFC: replace m_VPInstruction<VPInstruction::Reverse>
with m_Reverse
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2fb83567894ee..d1d16a21002e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2282,7 +2282,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
};
if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
- if (match(Recipe, m_VPInstruction<VPInstruction::Reverse>(m_VPValue())))
+ if (match(Recipe, m_Reverse(m_VPValue())))
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
if (Recipe)
CCH = ComputeCCH(Recipe);
@@ -2295,8 +2295,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
CCH = TTI::CastContextHint::Normal;
else if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
- if (match(Recipe,
- m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReverseOp))))
+ if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
Recipe = ReverseOp->getDefiningRecipe();
if (Recipe)
CCH = ComputeCCH(Recipe);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6f707cc116c6a..16a4a55f3c2f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2921,7 +2921,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// TODO: Only convert reverse to vp.reverse if it uses the result of
// vp.load, or defines the stored value of vp.store.
- if (match(&R, m_VPInstruction<VPInstruction::Reverse>(m_VPValue(V1)))) {
+ if (match(&R, m_Reverse(m_VPValue(V1)))) {
NewRecipe = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse, {V1, Plan.getTrue(), &EVL},
TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
>From 7b22bdf8e77f7a7d17db923865d1c0cec5087584 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 4 Dec 2025 00:26:11 -0800
Subject: [PATCH 13/13] The third conversion method for vp.reverse
This may temporarily lose some performance when tail folding by EVL.
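Rather than blanket-converting every Reverse, optimizeMaskToEVL now builds
the vp.reverse directly while forming the EVL memory recipe: a reverse load
becomes a VPWidenLoadEVLRecipe followed by a vp.reverse of its result, and a
reverse store first vp.reverses the stored value. A reverse that is not
matched this way keeps the plain llvm.vector.reverse, and a reverse of a
loop-invariant value that LICM had hoisted can move back into the loop, as
in the tail-folding-uniform-store.ll update below, where the preheader's

  call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer)

reappears inside the loop as

  call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 %evl)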
---
llvm/lib/Transforms/Vectorize/VPlan.h | 9 +++
.../Transforms/Vectorize/VPlanTransforms.cpp | 79 ++++++++++---------
.../RISCV/tail-folding-uniform-store.ll | 2 +-
3 files changed, 51 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3b8b8e2f2aa00..e846cb700d421 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3472,6 +3472,15 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
setMask(Mask);
}
+ VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
+ VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+ {Addr, StoredVal, &EVL}, S.isConsecutive(),
+ S.isReverse(), S, S.getDebugLoc()) {
+ assert(isReverse() && "Only reverse accesses need to set a new stored value");
+ setMask(Mask);
+ }
+
VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
/// Return the address accessed by this recipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 16a4a55f3c2f6..9927e9b167195 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2795,12 +2795,19 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
EVL, Mask);
- if (match(&CurRecipe,
+ VPValue *ReversedVal;
+ if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
+ match(ReversedVal,
m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
- cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
- return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
- AdjustEndPtr(EndPtr), EVL, Mask);
+ cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
+ auto *LoadR = new VPWidenLoadEVLRecipe(
+ *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
+ LoadR->insertBefore(&CurRecipe);
+ return new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
+ TypeInfo.inferScalarType(LoadR), {}, {}, DL);
+ }
if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
@@ -2808,12 +2815,23 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
EVL, Mask);
- if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
+ VPValue *StoredVal;
+ if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
- cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
- return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
- AdjustEndPtr(EndPtr), EVL, Mask);
+ cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
+ if (match(StoredVal, m_Reverse(m_VPValue(ReversedVal)))) {
+ auto *NewReverse = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_reverse,
+ {ReversedVal, Plan->getTrue(), &EVL},
+ TypeInfo.inferScalarType(ReversedVal), {}, {},
+ cast<VPInstruction>(StoredVal)->getDebugLoc());
+ NewReverse->insertBefore(&CurRecipe);
+ return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
+ AdjustEndPtr(EndPtr), NewReverse, EVL,
+ Mask);
+ }
+ }
if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
if (Rdx->isConditional() &&
@@ -2886,7 +2904,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// contained.
bool ContainsFORs =
any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
- VPValue *PrevEVL = nullptr;
if (ContainsFORs) {
// TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
VPValue *MaxEVL = &Plan.getVF();
@@ -2897,42 +2914,28 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
Builder.setInsertPoint(Header, Header->getFirstNonPhi());
- PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc::getUnknown(),
- "prev.evl");
- }
-
-  // Transform recipes that must be converted to vector predication
-  // intrinsics even if they do not use the header mask.
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
- for (VPRecipeBase &R : *VPBB) {
- VPWidenIntrinsicRecipe *NewRecipe = nullptr;
- VPValue *V1, *V2;
- if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
- m_VPValue(V1), m_VPValue(V2)))) {
+ VPValue *PrevEVL = Builder.createScalarPhi(
+ {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ VPValue *V1, *V2;
+ if (!match(&R,
+ m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+ m_VPValue(V1), m_VPValue(V2))))
+ continue;
VPValue *Imm = Plan.getOrAddLiveIn(
ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
- NewRecipe = new VPWidenIntrinsicRecipe(
+ VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_splice,
{V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
R.getDebugLoc());
+ VPSplice->insertBefore(&R);
+ R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
+ ToErase.push_back(&R);
}
-
- // TODO: Only convert reverse to vp.reverse if it uses the result of
- // vp.load, or defines the stored value of vp.store.
- if (match(&R, m_Reverse(m_VPValue(V1)))) {
- NewRecipe = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse, {V1, Plan.getTrue(), &EVL},
- TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
- R.getDebugLoc());
- }
-
- if (!NewRecipe)
- continue;
- NewRecipe->insertBefore(&R);
- R.getVPSingleValue()->replaceAllUsesWith(NewRecipe);
- ToErase.push_back(&R);
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
index ad0cc57deb8a7..e523de9de7a26 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
@@ -15,7 +15,6 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -24,6 +23,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 0, [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], 1