[llvm] [VPlan] Extract reverse interleave pointer adjustment into VPReverseInterleavePtrRecipe (PR #144864)

Mel Chen via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 24 02:23:08 PDT 2025


https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/144864

From 0f530f85945d2a07fd1d059436fed10522e449de Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 19 Jun 2025 01:44:28 -0700
Subject: [PATCH 1/4] First step: interleave accesses for EVL tail folding (POC)

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  1 +
 llvm/lib/Transforms/Vectorize/VPlan.h         | 49 +++++++++++++++++++
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  7 +--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 +++++++++++-------
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 15 ++++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  1 +
 .../AArch64/sve-interleaved-accesses.ll       | 16 +++---
 7 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..ce40c6ccba92e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,6 +4256,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPDerivedIVSC:
       case VPDef::VPScalarIVStepsSC:
       case VPDef::VPReplicateSC:
+      case VPDef::VPReverseInterleavePtrSC:
       case VPDef::VPInstructionSC:
       case VPDef::VPCanonicalIVPHISC:
       case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8ec..daef26fe86d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
     case VPRecipeBase::VPReductionSC:
+    case VPRecipeBase::VPReverseInterleavePtrSC:
     case VPRecipeBase::VPMulAccumulateReductionSC:
     case VPRecipeBase::VPExtendedReductionSC:
     case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
            R->getVPDefID() == VPRecipeBase::VPReductionSC ||
            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+           R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
 #endif
 };
 
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+  Type *IndexedTy;
+  unsigned Factor;
+
+public:
+  VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+                               unsigned Factor, GEPNoWrapFlags GEPFlags,
+                               DebugLoc DL)
+      : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+                            ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+        IndexedTy(IndexedTy), Factor(Factor) {
+    assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+  VPValue *getPtr() const { return getOperand(0); }
+
+  VPValue *getVFValue() const { return getOperand(1); }
+
+  void execute(VPTransformState &State) override;
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
+  VPReverseInterleavePtrRecipe *clone() override {
+    return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+                                            Factor, getGEPNoWrapFlags(),
+                                            getDebugLoc());
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A pure virtual base class for all recipes modeling header phis, including
 /// phis for first order recurrences, pointer inductions and reductions. The
 /// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..98889cb5c520c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,9 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
                 VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
                 VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
-                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
-            return inferScalarType(R->getOperand(0));
-          })
+                VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+              [this](const VPRecipeBase *R) {
+                return inferScalarType(R->getOperand(0));
+              })
           // VPInstructionWithType must be handled before VPInstruction.
           .Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
                 VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..40dde8cfaea73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
+  case VPReverseInterleavePtrSC:
   case VPVectorEndPointerSC:
     return false;
   case VPInstructionSC:
@@ -2262,6 +2263,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+  auto &Builder = State.Builder;
+  Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+  Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+  Type *IndexTy = Builder.getInt32Ty();
+  if (RuntimeVF->getType() != IndexTy)
+    RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+  Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+  Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+  Index = Builder.CreateNeg(Index);
+  Value *ReversePtr =
+      Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+  State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+                                         VPSlotTracker &SlotTracker) const {
+  O << Indent;
+  printAsOperand(O, SlotTracker);
+  O << " = reverse-interleave-ptr";
+  printFlags(O);
+  printOperands(O, SlotTracker);
+}
+#endif
+
 void VPBlendRecipe::execute(VPTransformState &State) {
   assert(isNormalized() && "Expected blend to be normalized!");
   // We know that all PHIs in non-header blocks are converted into
@@ -3223,25 +3251,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (auto *I = dyn_cast<Instruction>(ResAddr))
     State.setDebugLocFrom(I->getDebugLoc());
 
-  // If the group is reverse, adjust the index to refer to the last vector lane
-  // instead of the first. We adjust the index from the first vector lane,
-  // rather than directly getting the pointer for lane VF - 1, because the
-  // pointer operand of the interleaved access is supposed to be uniform.
-  if (Group->isReverse()) {
-    Value *RuntimeVF =
-        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
-    Value *Index =
-        State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
-    Index = State.Builder.CreateMul(Index,
-                                    State.Builder.getInt32(Group->getFactor()));
-    Index = State.Builder.CreateNeg(Index);
-
-    bool InBounds = false;
-    if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
-      InBounds = Gep->isInBounds();
-    ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
-  }
-
   State.setDebugLocFrom(getDebugLoc());
   Value *PoisonVec = PoisonValue::get(VecTy);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a930329..6068b87663047 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2489,6 +2489,21 @@ void VPlanTransforms::createInterleaveGroups(
       Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
                       : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
     }
+    // If the group is reverse, adjust the index to refer to the last vector
+    // lane instead of the first. We adjust the index from the first vector
+    // lane, rather than directly getting the pointer for lane VF - 1, because
+    // the pointer operand of the interleaved access is supposed to be uniform.
+    if (IG->isReverse()) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(
+          getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+      auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+          GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+                                   : GEPNoWrapFlags::none(),
+          InsertPos->getDebugLoc());
+      ReversePtr->insertBefore(InsertPos);
+      Addr = ReversePtr;
+    }
     auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                         InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
     VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
+    VPReverseInterleavePtrSC,
     VPMulAccumulateReductionSC,
     VPExtendedReductionSC,
     VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695a..0333035a4b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1579,8 +1579,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1599,8 +1599,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
 ; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
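
The new VPReverseInterleavePtrRecipe models the address adjustment a reverse
interleave group needs: rather than starting the wide access at the group
pointer for the first lane, it steps the pointer back by (VF - 1) * Factor
elements so that, after the vector.reverse calls, the lanes line up with the
scalar iterations. A minimal scalar sketch of the offset built in
VPReverseInterleavePtrRecipe::execute() (illustration only; the helper below
is invented, not an LLVM API):

  #include <cassert>
  #include <cstdint>

  // Element offset applied to the interleave-group pointer when the group is
  // reversed: step back (RuntimeVF - 1) * Factor elements of the indexed type.
  int64_t reverseInterleaveOffset(int64_t RuntimeVF, int64_t Factor) {
    return -(RuntimeVF - 1) * Factor;
  }

  int main() {
    // Factor-2 group with a runtime VF of 8 lanes: the pointer moves back by
    // 14 elements, matching the "sub nsw i32 2, <VF * 2>" pattern in the
    // sve-interleaved-accesses.ll checks above.
    assert(reverseInterleaveOffset(8, 2) == -14);
    return 0;
  }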

From 6270d1ad02363fb307a90a29f944e3621cf5ea1c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 20 Jun 2025 01:44:32 -0700
Subject: [PATCH 2/4] Add stride support to VPVectorEndPointerRecipe for reverse interleaved accesses

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  5 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         | 11 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  5 +-
 .../AArch64/sve-interleaved-accesses.ll       | 24 +++----
 .../RISCV/riscv-vector-reverse-output.ll      | 48 ++++++++-----
 .../RISCV/riscv-vector-reverse.ll             | 68 ++++++++++---------
 ...-force-tail-with-evl-reverse-load-store.ll | 21 ++++--
 ...orize-force-tail-with-evl-uniform-store.ll |  3 +-
 9 files changed, 117 insertions(+), 87 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ce40c6ccba92e..38b63bcf648e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7767,8 +7767,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
               ? GEPNoWrapFlags::none()
               : GEPNoWrapFlags::inBounds();
-      VectorPtr = new VPVectorEndPointerRecipe(
-          Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+      VectorPtr =
+          new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
+                                       /*Stride*/ -1, Flags, I->getDebugLoc());
     } else {
       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                             GEP ? GEP->getNoWrapFlags()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index daef26fe86d79..5b487f301707c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1702,12 +1702,16 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
                                  public VPUnrollPartAccessor<2> {
   Type *IndexedTy;
 
+  int64_t Stride;
+
 public:
   VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
-                           GEPNoWrapFlags GEPFlags, DebugLoc DL)
+                           int64_t Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL)
       : VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
                             ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
-        IndexedTy(IndexedTy) {}
+        IndexedTy(IndexedTy), Stride(Stride) {
+    assert(Stride != 0 && "Unexpected stride");
+  }
 
   VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
 
@@ -1739,7 +1743,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 
   VPVectorEndPointerRecipe *clone() override {
     return new VPVectorEndPointerRecipe(getOperand(0), getVFValue(), IndexedTy,
-                                        getGEPNoWrapFlags(), getDebugLoc());
+                                        Stride, getGEPNoWrapFlags(),
+                                        getDebugLoc());
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 40dde8cfaea73..3f70111947dbd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2193,12 +2193,12 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
+static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
                            unsigned CurrentPart, IRBuilderBase &Builder) {
   // Use i32 for the gep index type when the value is constant,
   // or query DataLayout for a more suitable index type otherwise.
   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
-  return IsScalable && (IsReverse || CurrentPart > 0)
+  return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
              ? DL.getIndexType(Builder.getPtrTy(0))
              : Builder.getInt32Ty();
 }
@@ -2206,18 +2206,21 @@ static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
 void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   unsigned CurrentPart = getUnrollPart(*this);
+  bool IsUnitStride = Stride == 1 || Stride == -1;
   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
-                                CurrentPart, Builder);
+                                IsUnitStride, CurrentPart, Builder);
 
   // The wide store needs to start at the last vector element.
   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
   if (IndexTy != RunTimeVF->getType())
     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
-  // NumElt = -CurrentPart * RunTimeVF
+  // NumElt = Stride * CurrentPart * RunTimeVF
   Value *NumElt = Builder.CreateMul(
-      ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
-  // LastLane = 1 - RunTimeVF
-  Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+      ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
+  // LastLane = Stride * (RunTimeVF - 1)
+  Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
+  if (Stride != 1)
+    LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
   Value *Ptr = State.get(getOperand(0), VPLane(0));
   Value *ResultPtr =
       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
@@ -2242,7 +2245,7 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   unsigned CurrentPart = getUnrollPart(*this);
   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
-                                CurrentPart, Builder);
+                                /*IsUnitStride*/ true, CurrentPart, Builder);
   Value *Ptr = State.get(getOperand(0), VPLane(0));
 
   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6068b87663047..d5c59bbab9bed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2496,8 +2496,9 @@ void VPlanTransforms::createInterleaveGroups(
     if (IG->isReverse()) {
       auto *GEP = dyn_cast<GetElementPtrInst>(
           getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
-      auto *ReversePtr = new VPReverseInterleavePtrRecipe(
-          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+      auto *ReversePtr = new VPVectorEndPointerRecipe(
+          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
+          -(int64_t)IG->getFactor(),
           GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
                                    : GEPNoWrapFlags::none(),
           InsertPos->getDebugLoc());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0333035a4b0bf..02a9d697eed50 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,10 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw i64 2, [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
@@ -381,10 +379,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = sub nsw i64 2, [[TMP15]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
 ; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
@@ -1579,10 +1575,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw i64 4, [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
@@ -1599,10 +1593,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
-; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP25:%.*]] = sub nsw i64 4, [[TMP22]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
 ; CHECK-NEXT:    [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index a28673cf8e552..1a68775e8d1f5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -40,7 +40,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
 ; RV64-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT:    [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
 ; RV64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
 ; RV64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
 ; RV64-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
@@ -48,7 +49,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
 ; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; RV64-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP17:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
 ; RV64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
 ; RV64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
 ; RV64-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
@@ -98,7 +100,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
 ; RV32-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
 ; RV32-NEXT:    [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT:    [[TMP12:%.*]] = sub i32 1, [[TMP10]]
+; RV32-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
 ; RV32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]]
 ; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
 ; RV32-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -107,7 +110,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; RV32-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
 ; RV32-NEXT:    [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT:    [[TMP19:%.*]] = sub i32 1, [[TMP17]]
+; RV32-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT:    [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
 ; RV32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
 ; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
 ; RV32-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -157,11 +161,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
 ; RV64-UF2-NEXT:    [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP12:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
 ; RV64-UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
 ; RV64-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
 ; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
 ; RV64-UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
 ; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
 ; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -172,11 +178,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
 ; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
 ; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
 ; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
 ; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
 ; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
 ; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
 ; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
@@ -246,7 +254,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
 ; RV64-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT:    [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
 ; RV64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
 ; RV64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
 ; RV64-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
@@ -254,7 +263,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
 ; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
 ; RV64-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP17:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
 ; RV64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
 ; RV64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
 ; RV64-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
@@ -304,7 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
 ; RV32-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
 ; RV32-NEXT:    [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT:    [[TMP12:%.*]] = sub i32 1, [[TMP10]]
+; RV32-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
 ; RV32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]]
 ; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
 ; RV32-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
@@ -313,7 +324,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
 ; RV32-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
 ; RV32-NEXT:    [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT:    [[TMP19:%.*]] = sub i32 1, [[TMP17]]
+; RV32-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT:    [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
 ; RV32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
 ; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
 ; RV32-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
@@ -363,11 +375,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
 ; RV64-UF2-NEXT:    [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP12:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
 ; RV64-UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
 ; RV64-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
 ; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
 ; RV64-UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
 ; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
 ; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
@@ -378,11 +392,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
 ; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
 ; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
 ; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
 ; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
 ; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
 ; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
 ; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
 ; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ba4c4b6d58add..88b3000f374ec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -334,22 +334,24 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    %22 = zext i32 %21 to i64
 ; CHECK-NEXT:    %23 = getelementptr inbounds i32, ptr %B, i64 %22
 ; CHECK-NEXT:    %24 = mul i64 0, %18
-; CHECK-NEXT:    %25 = sub i64 1, %18
-; CHECK-NEXT:    %26 = getelementptr inbounds i32, ptr %23, i64 %24
-; CHECK-NEXT:    %27 = getelementptr inbounds i32, ptr %26, i64 %25
-; CHECK-NEXT:    %wide.load = load <vscale x 4 x i32>, ptr %27, align 4
+; CHECK-NEXT:    %25 = sub i64 %18, 1
+; CHECK-NEXT:    %26 = mul i64 -1, %25
+; CHECK-NEXT:    %27 = getelementptr inbounds i32, ptr %23, i64 %24
+; CHECK-NEXT:    %28 = getelementptr inbounds i32, ptr %27, i64 %26
+; CHECK-NEXT:    %wide.load = load <vscale x 4 x i32>, ptr %28, align 4
 ; CHECK-NEXT:    %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
-; CHECK-NEXT:    %28 = add <vscale x 4 x i32> %reverse, splat (i32 1)
-; CHECK-NEXT:    %29 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT:    %30 = mul i64 0, %18
-; CHECK-NEXT:    %31 = sub i64 1, %18
-; CHECK-NEXT:    %32 = getelementptr inbounds i32, ptr %29, i64 %30
-; CHECK-NEXT:    %33 = getelementptr inbounds i32, ptr %32, i64 %31
-; CHECK-NEXT:    %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28)
-; CHECK-NEXT:    store <vscale x 4 x i32> %reverse4, ptr %33, align 4
+; CHECK-NEXT:    %29 = add <vscale x 4 x i32> %reverse, splat (i32 1)
+; CHECK-NEXT:    %30 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT:    %31 = mul i64 0, %18
+; CHECK-NEXT:    %32 = sub i64 %18, 1
+; CHECK-NEXT:    %33 = mul i64 -1, %32
+; CHECK-NEXT:    %34 = getelementptr inbounds i32, ptr %30, i64 %31
+; CHECK-NEXT:    %35 = getelementptr inbounds i32, ptr %34, i64 %33
+; CHECK-NEXT:    %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29)
+; CHECK-NEXT:    store <vscale x 4 x i32> %reverse4, ptr %35, align 4
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, %18
-; CHECK-NEXT:    %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT:    br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT:    %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1 %36, <null operand!>, label %vector.body
 ; CHECK-NEXT:  LV: created middle.block
 ; CHECK-NEXT:  LV: draw edge fromvector.body
 ; CHECK-NEXT:  LV: vectorizing VPBB:middle.block in BB:middle.block
@@ -380,8 +382,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:    %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:    %35 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT:    %add9 = add i32 %35, 1
+; CHECK-NEXT:    %37 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    %add9 = add i32 %37, 1
 ; CHECK-NEXT:    %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
 ; CHECK-NEXT:    store i32 %add9, ptr %arrayidx3, align 4
 ; CHECK-NEXT:    %cmp = icmp ugt i64 %indvars.iv, 1
@@ -743,22 +745,24 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    %22 = zext i32 %21 to i64
 ; CHECK-NEXT:    %23 = getelementptr inbounds float, ptr %B, i64 %22
 ; CHECK-NEXT:    %24 = mul i64 0, %18
-; CHECK-NEXT:    %25 = sub i64 1, %18
-; CHECK-NEXT:    %26 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT:    %27 = getelementptr inbounds float, ptr %26, i64 %25
-; CHECK-NEXT:    %wide.load = load <vscale x 4 x float>, ptr %27, align 4
+; CHECK-NEXT:    %25 = sub i64 %18, 1
+; CHECK-NEXT:    %26 = mul i64 -1, %25
+; CHECK-NEXT:    %27 = getelementptr inbounds float, ptr %23, i64 %24
+; CHECK-NEXT:    %28 = getelementptr inbounds float, ptr %27, i64 %26
+; CHECK-NEXT:    %wide.load = load <vscale x 4 x float>, ptr %28, align 4
 ; CHECK-NEXT:    %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
-; CHECK-NEXT:    %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT:    %29 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT:    %30 = mul i64 0, %18
-; CHECK-NEXT:    %31 = sub i64 1, %18
-; CHECK-NEXT:    %32 = getelementptr inbounds float, ptr %29, i64 %30
-; CHECK-NEXT:    %33 = getelementptr inbounds float, ptr %32, i64 %31
-; CHECK-NEXT:    %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28)
-; CHECK-NEXT:    store <vscale x 4 x float> %reverse4, ptr %33, align 4
+; CHECK-NEXT:    %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
+; CHECK-NEXT:    %30 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT:    %31 = mul i64 0, %18
+; CHECK-NEXT:    %32 = sub i64 %18, 1
+; CHECK-NEXT:    %33 = mul i64 -1, %32
+; CHECK-NEXT:    %34 = getelementptr inbounds float, ptr %30, i64 %31
+; CHECK-NEXT:    %35 = getelementptr inbounds float, ptr %34, i64 %33
+; CHECK-NEXT:    %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29)
+; CHECK-NEXT:    store <vscale x 4 x float> %reverse4, ptr %35, align 4
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, %18
-; CHECK-NEXT:    %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT:    br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT:    %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1 %36, <null operand!>, label %vector.body
 ; CHECK-NEXT:  LV: created middle.block
 ; CHECK-NEXT:  LV: draw edge fromvector.body
 ; CHECK-NEXT:  LV: vectorizing VPBB:middle.block in BB:middle.block
@@ -789,8 +793,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:    %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:    %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:    %35 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:    %conv1 = fadd float %35, 1.000000e+00
+; CHECK-NEXT:    %37 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    %conv1 = fadd float %37, 1.000000e+00
 ; CHECK-NEXT:    %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:    store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:    %cmp = icmp ugt i64 %indvars.iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 4d8166eaa46f1..b01eb8b8332e3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -33,7 +33,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP18]], 1
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 -1, [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -41,7 +42,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP19]], 1
+; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -136,7 +138,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP26]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP26]], 1
+; IF-EVL-NEXT:    [[TMP18:%.*]] = mul i64 -1, [[TMP15]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
 ; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -145,7 +148,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP27]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 [[TMP27]], 1
+; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
 ; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -261,7 +265,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
+; IF-EVL-NEXT:    [[TMP29:%.*]] = sub i64 [[TMP9]], 1
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP29]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
@@ -271,7 +276,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 [[TMP16]], 1
+; IF-EVL-NEXT:    [[TMP18:%.*]] = mul i64 -1, [[TMP30]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
 ; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
@@ -279,7 +285,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
+; IF-EVL-NEXT:    [[TMP31:%.*]] = sub i64 [[TMP22]], 1
+; IF-EVL-NEXT:    [[TMP24:%.*]] = mul i64 -1, [[TMP31]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
 ; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 82e8d3d6c611a..a09e6c8b856e6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -38,7 +38,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sub i64 1, [[TMP15]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
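
With the stride generalization, VPVectorEndPointerRecipe now emits two GEP
offsets per unrolled part: NumElt = Stride * CurrentPart * RunTimeVF and
LastLane = Stride * (RunTimeVF - 1). Stride = -1 reproduces the previous
reversed-access lowering (-CurrentPart * VF and 1 - VF), while Stride = -Factor
gives the adjustment a reverse interleave group needs. A small scalar model of
those two offsets (illustration only; the names are invented, not LLVM APIs):

  #include <cassert>
  #include <cstdint>

  struct EndPtrOffsets {
    int64_t NumElt;   // Stride * CurrentPart * RunTimeVF
    int64_t LastLane; // Stride * (RunTimeVF - 1)
  };

  EndPtrOffsets vectorEndPointerOffsets(int64_t Stride, int64_t CurrentPart,
                                        int64_t RunTimeVF) {
    return {Stride * CurrentPart * RunTimeVF, Stride * (RunTimeVF - 1)};
  }

  int main() {
    // Stride = -1, part 1, VF = 8: same offsets as the old "1 - VF" lowering.
    EndPtrOffsets R = vectorEndPointerOffsets(-1, 1, 8);
    assert(R.NumElt == -8 && R.LastLane == 1 - 8);
    // Stride = -2 (factor-2 reverse interleave group), part 0, VF = 8: the
    // pointer steps back 14 elements, as in patch 1.
    EndPtrOffsets G = vectorEndPointerOffsets(-2, 0, 8);
    assert(G.NumElt == 0 && G.LastLane == -14);
    return 0;
  }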

From 2f9631f9275ec38a623e75c38de1b5de44eb2170 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Jun 2025 02:20:10 -0700
Subject: [PATCH 3/4] Remove VPReverseInterleavePtrRecipe

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  1 -
 llvm/lib/Transforms/Vectorize/VPlan.h         | 49 -------------------
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  7 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 -----------
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  1 -
 5 files changed, 3 insertions(+), 83 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 38b63bcf648e2..a768c326f2941 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,7 +4256,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPDerivedIVSC:
       case VPDef::VPScalarIVStepsSC:
       case VPDef::VPReplicateSC:
-      case VPDef::VPReverseInterleavePtrSC:
       case VPDef::VPInstructionSC:
       case VPDef::VPCanonicalIVPHISC:
       case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5b487f301707c..0be361f0f5c9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,7 +531,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
     case VPRecipeBase::VPReductionSC:
-    case VPRecipeBase::VPReverseInterleavePtrSC:
     case VPRecipeBase::VPMulAccumulateReductionSC:
     case VPRecipeBase::VPExtendedReductionSC:
     case VPRecipeBase::VPReplicateSC:
@@ -852,7 +851,6 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
            R->getVPDefID() == VPRecipeBase::VPReductionSC ||
            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
-           R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1803,53 +1801,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
 #endif
 };
 
-class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
-  Type *IndexedTy;
-  unsigned Factor;
-
-public:
-  VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
-                               unsigned Factor, GEPNoWrapFlags GEPFlags,
-                               DebugLoc DL)
-      : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
-                            ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
-        IndexedTy(IndexedTy), Factor(Factor) {
-    assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
-  }
-
-  VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
-
-  VPValue *getPtr() const { return getOperand(0); }
-
-  VPValue *getVFValue() const { return getOperand(1); }
-
-  void execute(VPTransformState &State) override;
-
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
-    assert(is_contained(operands(), Op) &&
-           "Op must be an operand of the recipe");
-    return true;
-  }
-
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override {
-    // TODO: Compute accurate cost after retiring the legacy cost model.
-    return 0;
-  }
-
-  VPReverseInterleavePtrRecipe *clone() override {
-    return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
-                                            Factor, getGEPNoWrapFlags(),
-                                            getDebugLoc());
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
-#endif
-};
-
 /// A pure virtual base class for all recipes modeling header phis, including
 /// phis for first order recurrences, pointer inductions and reductions. The
 /// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 98889cb5c520c..76da5b0314a8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,10 +282,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
                 VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
                 VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
-                VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
-              [this](const VPRecipeBase *R) {
-                return inferScalarType(R->getOperand(0));
-              })
+                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
+            return inferScalarType(R->getOperand(0));
+          })
           // VPInstructionWithType must be handled before VPInstruction.
           .Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
                 VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3f70111947dbd..abe473199ee6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,7 +150,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
-  case VPReverseInterleavePtrSC:
   case VPVectorEndPointerSC:
     return false;
   case VPInstructionSC:
@@ -2266,33 +2265,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
-  auto &Builder = State.Builder;
-  Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
-  Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
-  Type *IndexTy = Builder.getInt32Ty();
-  if (RuntimeVF->getType() != IndexTy)
-    RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
-  Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
-  Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
-  Index = Builder.CreateNeg(Index);
-  Value *ReversePtr =
-      Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
-
-  State.set(this, ReversePtr, /*IsScalar*/ true);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
-                                         VPSlotTracker &SlotTracker) const {
-  O << Indent;
-  printAsOperand(O, SlotTracker);
-  O << " = reverse-interleave-ptr";
-  printFlags(O);
-  printOperands(O, SlotTracker);
-}
-#endif
-
 void VPBlendRecipe::execute(VPTransformState &State) {
   assert(isNormalized() && "Expected blend to be normalized!");
   // We know that all PHIs in non-header blocks are converted into
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83f6ac223af1e..a0d3dc9b934cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,7 +335,6 @@ class VPDef {
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
-    VPReverseInterleavePtrSC,
     VPMulAccumulateReductionSC,
     VPExtendedReductionSC,
     VPPartialReductionSC,
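
With the stride-aware VPVectorEndPointerRecipe in place, the dedicated recipe
from patch 1 is redundant: createInterleaveGroups now passes Stride = -Factor,
and the combined element offset of the two GEPs, Stride * Part * VF +
Stride * (VF - 1), collapses for part 0 to -(VF - 1) * Factor, exactly what the
deleted VPReverseInterleavePtrRecipe computed. A one-line check of that
equivalence (sketch, not LLVM code):

  #include <cassert>
  #include <cstdint>

  // Combined element offset of the two GEPs emitted by VPVectorEndPointerRecipe.
  int64_t combinedOffset(int64_t Stride, int64_t Part, int64_t VF) {
    return Stride * Part * VF + Stride * (VF - 1);
  }

  int main() {
    // Factor-4 group (as in interleave_deinterleave_reverse), part 0, VF = 8.
    const int64_t Factor = 4, VF = 8;
    assert(combinedOffset(-Factor, 0, VF) == -(VF - 1) * Factor);
    return 0;
  }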

From b50ca85fd38494b572f3e236719475ec054a0670 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Jun 2025 02:22:38 -0700
Subject: [PATCH 4/4] Update the VPVectorEndPointerRecipe comment

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0be361f0f5c9a..3eefb4815060c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1695,7 +1695,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
 
 /// A recipe to compute a pointer to the last element of each part of a widened
 /// memory access for widened memory accesses of IndexedTy. Used for
-/// VPWidenMemoryRecipes that are reversed.
+/// VPWidenMemoryRecipes or VPInterleaveRecipes that are reversed.
 class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
                                  public VPUnrollPartAccessor<2> {
   Type *IndexedTy;


