[llvm] [VPlan] Extract reverse interleave pointer adjustment into VPReverseInterleavePtrRecipe (PR #144864)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 24 02:23:08 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/144864
>From 0f530f85945d2a07fd1d059436fed10522e449de Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 19 Jun 2025 01:44:28 -0700
Subject: [PATCH 1/4] First step: interleave accesses for EVL tail folding (POC)
---
.../Transforms/Vectorize/LoopVectorize.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlan.h | 49 +++++++++++++++++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 7 +--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 +++++++++++-------
.../Transforms/Vectorize/VPlanTransforms.cpp | 15 ++++++
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../AArch64/sve-interleaved-accesses.ll | 16 +++---
7 files changed, 106 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..ce40c6ccba92e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,6 +4256,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
+ case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8ec..daef26fe86d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+ Type *IndexedTy;
+ unsigned Factor;
+
+public:
+ VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+ unsigned Factor, GEPNoWrapFlags GEPFlags,
+ DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+ IndexedTy(IndexedTy), Factor(Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+ VPValue *getPtr() const { return getOperand(0); }
+
+ VPValue *getVFValue() const { return getOperand(1); }
+
+ void execute(VPTransformState &State) override;
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ // TODO: Compute accurate cost after retiring the legacy cost model.
+ return 0;
+ }
+
+ VPReverseInterleavePtrRecipe *clone() override {
+ return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+ Factor, getGEPNoWrapFlags(),
+ getDebugLoc());
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..98889cb5c520c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,9 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..40dde8cfaea73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
+ case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2262,6 +2263,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+ Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+ Type *IndexTy = Builder.getInt32Ty();
+ if (RuntimeVF->getType() != IndexTy)
+ RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+ Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+ Index = Builder.CreateNeg(Index);
+ Value *ReversePtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+ State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = reverse-interleave-ptr";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
@@ -3223,25 +3251,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform.
- if (Group->isReverse()) {
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- Value *Index =
- State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
- Index = State.Builder.CreateMul(Index,
- State.Builder.getInt32(Group->getFactor()));
- Index = State.Builder.CreateNeg(Index);
-
- bool InBounds = false;
- if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
- InBounds = Gep->isInBounds();
- ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
- }
-
State.setDebugLocFrom(getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a930329..6068b87663047 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2489,6 +2489,21 @@ void VPlanTransforms::createInterleaveGroups(
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
}
+ // If the group is reverse, adjust the index to refer to the last vector
+ // lane instead of the first. We adjust the index from the first vector
+ // lane, rather than directly getting the pointer for lane VF - 1, because
+ // the pointer operand of the interleaved access is supposed to be uniform.
+ if (IG->isReverse()) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+ auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ InsertPos->getDebugLoc());
+ ReversePtr->insertBefore(InsertPos);
+ Addr = ReversePtr;
+ }
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695a..0333035a4b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1579,8 +1579,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1599,8 +1599,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
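
The address math above is the core of PATCH 1: for a reversed interleave group, VPReverseInterleavePtrRecipe::execute offsets the uniform base pointer by Index = -((VF - 1) * Factor) elements so the wide load or store starts at the first member of the last lane. Below is a minimal standalone sketch of that arithmetic only (plain C++ with illustrative names, not LLVM API, and not the recipe implementation itself):

#include <cassert>
#include <cstdint>

// Illustrative only: a reversed interleave group of `Factor` members with
// runtime vector length `VF` must start its wide access at the first member
// of the last lane, so the recipe offsets the uniform base pointer by
// -((VF - 1) * Factor) elements (the same value the inline code removed from
// VPInterleaveRecipe::execute used to compute).
int64_t reverseInterleaveOffset(int64_t VF, int64_t Factor) {
  assert(Factor >= 2 && Factor <= 8 && "factor range asserted by the recipe");
  return -((VF - 1) * Factor);
}

int main() {
  // VF = 4, Factor = 2: the wide access begins 6 elements before the base,
  // covering the members of lanes 3, 2, 1, 0 in memory order.
  assert(reverseInterleaveOffset(4, 2) == -6);
  assert(reverseInterleaveOffset(8, 4) == -28);
  return 0;
}
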
>From 6270d1ad02363fb307a90a29f944e3621cf5ea1c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 20 Jun 2025 01:44:32 -0700
Subject: [PATCH 2/4] Stride VectorEndPointer for reverse interleaved access
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++---
.../Transforms/Vectorize/VPlanTransforms.cpp | 5 +-
.../AArch64/sve-interleaved-accesses.ll | 24 +++----
.../RISCV/riscv-vector-reverse-output.ll | 48 ++++++++-----
.../RISCV/riscv-vector-reverse.ll | 68 ++++++++++---------
...-force-tail-with-evl-reverse-load-store.ll | 21 ++++--
...orize-force-tail-with-evl-uniform-store.ll | 3 +-
9 files changed, 117 insertions(+), 87 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ce40c6ccba92e..38b63bcf648e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7767,8 +7767,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
(CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
? GEPNoWrapFlags::none()
: GEPNoWrapFlags::inBounds();
- VectorPtr = new VPVectorEndPointerRecipe(
- Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+ VectorPtr =
+ new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
+ /*Stride*/ -1, Flags, I->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index daef26fe86d79..5b487f301707c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1702,12 +1702,16 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
+ int64_t Stride;
+
public:
VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
- GEPNoWrapFlags GEPFlags, DebugLoc DL)
+ int64_t Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
- IndexedTy(IndexedTy) {}
+ IndexedTy(IndexedTy), Stride(Stride) {
+ assert(Stride != 0 && "Unexpected stride");
+ }
VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
@@ -1739,7 +1743,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
VPVectorEndPointerRecipe *clone() override {
return new VPVectorEndPointerRecipe(getOperand(0), getVFValue(), IndexedTy,
- getGEPNoWrapFlags(), getDebugLoc());
+ Stride, getGEPNoWrapFlags(),
+ getDebugLoc());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 40dde8cfaea73..3f70111947dbd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2193,12 +2193,12 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
+static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
unsigned CurrentPart, IRBuilderBase &Builder) {
// Use i32 for the gep index type when the value is constant,
// or query DataLayout for a more suitable index type otherwise.
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
- return IsScalable && (IsReverse || CurrentPart > 0)
+ return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
? DL.getIndexType(Builder.getPtrTy(0))
: Builder.getInt32Ty();
}
@@ -2206,18 +2206,21 @@ static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
+ bool IsUnitStride = Stride == 1 || Stride == -1;
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
- CurrentPart, Builder);
+ IsUnitStride, CurrentPart, Builder);
// The wide store needs to start at the last vector element.
Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
if (IndexTy != RunTimeVF->getType())
RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
- // NumElt = -CurrentPart * RunTimeVF
+ // NumElt = Stride * CurrentPart * RunTimeVF
Value *NumElt = Builder.CreateMul(
- ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
- // LastLane = 1 - RunTimeVF
- Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+ ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
+ // LastLane = Stride * (RunTimeVF - 1)
+ Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
+ if (Stride != 1)
+ LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
@@ -2242,7 +2245,7 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
- CurrentPart, Builder);
+ /*IsUnitStride*/ true, CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6068b87663047..d5c59bbab9bed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2496,8 +2496,9 @@ void VPlanTransforms::createInterleaveGroups(
if (IG->isReverse()) {
auto *GEP = dyn_cast<GetElementPtrInst>(
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
- auto *ReversePtr = new VPReverseInterleavePtrRecipe(
- Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ auto *ReversePtr = new VPVectorEndPointerRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
+ -(int64_t)IG->getFactor(),
GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
: GEPNoWrapFlags::none(),
InsertPos->getDebugLoc());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0333035a4b0bf..02a9d697eed50 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,10 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i64 2, [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
@@ -381,10 +379,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = sub nsw i64 2, [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
@@ -1579,10 +1575,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = sub nsw i64 4, [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
@@ -1599,10 +1593,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP25:%.*]] = sub nsw i64 4, [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
; CHECK-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index a28673cf8e552..1a68775e8d1f5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -40,7 +40,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
@@ -48,7 +49,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
@@ -98,7 +100,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]]
+; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]]
; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -107,7 +110,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]]
+; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -157,11 +161,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -172,11 +178,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
@@ -246,7 +254,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
@@ -254,7 +263,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
@@ -304,7 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]]
+; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]]
; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
@@ -313,7 +324,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]]
+; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
@@ -363,11 +375,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
@@ -378,11 +392,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ba4c4b6d58add..88b3000f374ec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -334,22 +334,24 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 1, %18
-; CHECK-NEXT: %26 = getelementptr inbounds i32, ptr %23, i64 %24
-; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %26, i64 %25
-; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %27, align 4
+; CHECK-NEXT: %25 = sub i64 %18, 1
+; CHECK-NEXT: %26 = mul i64 -1, %25
+; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24
+; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26
+; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %28, align 4
; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
-; CHECK-NEXT: %28 = add <vscale x 4 x i32> %reverse, splat (i32 1)
-; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT: %30 = mul i64 0, %18
-; CHECK-NEXT: %31 = sub i64 1, %18
-; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %29, i64 %30
-; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %32, i64 %31
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %33, align 4
+; CHECK-NEXT: %29 = add <vscale x 4 x i32> %reverse, splat (i32 1)
+; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT: %31 = mul i64 0, %18
+; CHECK-NEXT: %32 = sub i64 %18, 1
+; CHECK-NEXT: %33 = mul i64 -1, %32
+; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31
+; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33
+; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29)
+; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %35, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge fromvector.body
; CHECK-NEXT: LV: vectorizing VPBB:middle.block in BB:middle.block
@@ -380,8 +382,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %35 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %35, 1
+; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %37, 1
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -743,22 +745,24 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 1, %18
-; CHECK-NEXT: %26 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %26, i64 %25
-; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %27, align 4
+; CHECK-NEXT: %25 = sub i64 %18, 1
+; CHECK-NEXT: %26 = mul i64 -1, %25
+; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24
+; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26
+; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %28, align 4
; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
-; CHECK-NEXT: %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %30 = mul i64 0, %18
-; CHECK-NEXT: %31 = sub i64 1, %18
-; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %29, i64 %30
-; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %32, i64 %31
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %33, align 4
+; CHECK-NEXT: %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
+; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT: %31 = mul i64 0, %18
+; CHECK-NEXT: %32 = sub i64 %18, 1
+; CHECK-NEXT: %33 = mul i64 -1, %32
+; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31
+; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33
+; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29)
+; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %35, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge fromvector.body
; CHECK-NEXT: LV: vectorizing VPBB:middle.block in BB:middle.block
@@ -789,8 +793,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %35 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %35, 1.000000e+00
+; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 4d8166eaa46f1..b01eb8b8332e3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -33,7 +33,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP18]], 1
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 -1, [[TMP11]]
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -41,7 +42,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -136,7 +138,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP26]], 1
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP15]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -145,7 +148,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1
+; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -261,7 +265,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
+; IF-EVL-NEXT: [[TMP29:%.*]] = sub i64 [[TMP9]], 1
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP29]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
@@ -271,7 +276,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP16]], 1
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP30]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
@@ -279,7 +285,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
+; IF-EVL-NEXT: [[TMP31:%.*]] = sub i64 [[TMP22]], 1
+; IF-EVL-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP31]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 82e8d3d6c611a..a09e6c8b856e6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -38,7 +38,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
+; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
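
PATCH 2 folds the same computation into VPVectorEndPointerRecipe by giving it a signed Stride: the recipe now emits two GEPs with offsets NumElt = Stride * Part * RunTimeVF and LastLane = Stride * (RunTimeVF - 1), where Stride is -1 for an ordinary reversed access and -Factor for a reversed interleave group. A minimal sketch of the two offsets (plain C++, illustrative names only, not LLVM API):

#include <cassert>
#include <cstdint>

// Illustrative only: the two GEP offsets emitted by the generalized
// VPVectorEndPointerRecipe for a given unroll part, runtime VF and signed
// element stride.
struct EndPointerOffsets {
  int64_t NumElt;   // offset applied by the first GEP
  int64_t LastLane; // offset applied by the second GEP
};

EndPointerOffsets endPointerOffsets(int64_t Part, int64_t VF, int64_t Stride) {
  assert(Stride != 0 && "stride must be non-zero");
  return {Stride * Part * VF, Stride * (VF - 1)};
}

int main() {
  // Reversed unit-stride access, part 0, VF 4: start 3 elements back.
  EndPointerOffsets A = endPointerOffsets(0, 4, -1);
  assert(A.NumElt == 0 && A.LastLane == -3);
  // Reversed factor-2 interleave group, part 0, VF 4: start 6 elements back,
  // matching -((VF - 1) * Factor) from the previous patch.
  EndPointerOffsets B = endPointerOffsets(0, 4, -2);
  assert(B.NumElt == 0 && B.LastLane == -6);
  return 0;
}

For the unit-stride case the resulting address is unchanged: the old `sub i64 1, %vf` and the new `sub`/`mul -1` sequence compute the same value, which is why the test churn in this patch is mostly instruction reordering.
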
>From 2f9631f9275ec38a623e75c38de1b5de44eb2170 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Jun 2025 02:20:10 -0700
Subject: [PATCH 3/4] Remove VPReverseInterleavePtrRecipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 1 -
llvm/lib/Transforms/Vectorize/VPlan.h | 49 -------------------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 7 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 -----------
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 -
5 files changed, 3 insertions(+), 83 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 38b63bcf648e2..a768c326f2941 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,7 +4256,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
- case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5b487f301707c..0be361f0f5c9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,7 +531,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
- case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -852,7 +851,6 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
- R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1803,53 +1801,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
-class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
- Type *IndexedTy;
- unsigned Factor;
-
-public:
- VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
- unsigned Factor, GEPNoWrapFlags GEPFlags,
- DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
- ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
- IndexedTy(IndexedTy), Factor(Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- }
-
- VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
-
- VPValue *getPtr() const { return getOperand(0); }
-
- VPValue *getVFValue() const { return getOperand(1); }
-
- void execute(VPTransformState &State) override;
-
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return true;
- }
-
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override {
- // TODO: Compute accurate cost after retiring the legacy cost model.
- return 0;
- }
-
- VPReverseInterleavePtrRecipe *clone() override {
- return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
- Factor, getGEPNoWrapFlags(),
- getDebugLoc());
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-};
-
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 98889cb5c520c..76da5b0314a8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,10 +282,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
- [this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3f70111947dbd..abe473199ee6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,7 +150,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
- case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2266,33 +2265,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
- auto &Builder = State.Builder;
- Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
- Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
- Type *IndexTy = Builder.getInt32Ty();
- if (RuntimeVF->getType() != IndexTy)
- RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
- Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
- Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
- Index = Builder.CreateNeg(Index);
- Value *ReversePtr =
- Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
-
- State.set(this, ReversePtr, /*IsScalar*/ true);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent;
- printAsOperand(O, SlotTracker);
- O << " = reverse-interleave-ptr";
- printFlags(O);
- printOperands(O, SlotTracker);
-}
-#endif
-
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83f6ac223af1e..a0d3dc9b934cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,7 +335,6 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
- VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
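
PATCH 3 then drops the dedicated recipe: with Stride = -Factor, the generalized VPVectorEndPointerRecipe already produces the index the deleted code computed. A quick check of that equivalence (plain C++, illustrative only; part 0 shown for simplicity):

#include <cassert>
#include <cstdint>

// Illustrative check, not LLVM code: with Stride = -Factor, the two offsets
// of the generalized VPVectorEndPointerRecipe sum to the single index the
// deleted VPReverseInterleavePtrRecipe emitted, so the dedicated recipe is
// redundant.
int main() {
  const int64_t Part = 0;
  for (int64_t Factor = 2; Factor <= 8; ++Factor)
    for (int64_t VF = 1; VF <= 64; VF *= 2) {
      int64_t Stride = -Factor;
      int64_t NumElt = Stride * Part * VF;     // first GEP offset
      int64_t LastLane = Stride * (VF - 1);    // second GEP offset
      int64_t OldIndex = -((VF - 1) * Factor); // deleted recipe's offset
      assert(NumElt + LastLane == OldIndex);
    }
  return 0;
}
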
>From b50ca85fd38494b572f3e236719475ec054a0670 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Jun 2025 02:22:38 -0700
Subject: [PATCH 4/4] comment
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0be361f0f5c9a..3eefb4815060c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1695,7 +1695,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// A recipe to compute a pointer to the last element of each part of a widened
/// memory access for widened memory accesses of IndexedTy. Used for
-/// VPWidenMemoryRecipes that are reversed.
+/// VPWidenMemoryRecipes or VPInterleaveRecipes that are reversed.
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;