[llvm] [VPlan] Emit VPVectorEndPointerRecipe for reverse interleave pointer adjustment (PR #144864)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 30 23:53:14 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/144864
>From 4175f87f81b66489fb2aa684e22d03a35f5cf664 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 19 Jun 2025 01:44:28 -0700
Subject: [PATCH 01/10] First step: interleave accesses for EVL tail folding (POC)
---
.../Transforms/Vectorize/LoopVectorize.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlan.h | 49 +++++++++++++++++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 7 +--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 +++++++++++-------
.../Transforms/Vectorize/VPlanTransforms.cpp | 15 ++++++
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../AArch64/sve-interleaved-accesses.ll | 16 +++---
7 files changed, 106 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b01c8b02ec66a..fed1ac5552a1e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4257,6 +4257,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
+ case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 61b5ccd85bc6e..604c14a4293bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1807,6 +1809,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+ Type *IndexedTy;
+ unsigned Factor;
+
+public:
+ VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+ unsigned Factor, GEPNoWrapFlags GEPFlags,
+ DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+ IndexedTy(IndexedTy), Factor(Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+ VPValue *getPtr() const { return getOperand(0); }
+
+ VPValue *getVFValue() const { return getOperand(1); }
+
+ void execute(VPTransformState &State) override;
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ // TODO: Compute accurate cost after retiring the legacy cost model.
+ return 0;
+ }
+
+ VPReverseInterleavePtrRecipe *clone() override {
+ return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+ Factor, getGEPNoWrapFlags(),
+ getDebugLoc());
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f3b99fe34c069..c45bc743c9fc7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -283,9 +283,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a38932ef99fe..459646f0f6a9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
+ case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2379,6 +2380,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+ Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+ Type *IndexTy = Builder.getInt32Ty();
+ if (RuntimeVF->getType() != IndexTy)
+ RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+ Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+ Index = Builder.CreateNeg(Index);
+ Value *ReversePtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+ State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = reverse-interleave-ptr";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
@@ -3324,25 +3352,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform.
- if (Group->isReverse()) {
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- Value *Index =
- State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
- Index = State.Builder.CreateMul(Index,
- State.Builder.getInt32(Group->getFactor()));
- Index = State.Builder.CreateNeg(Index);
-
- bool InBounds = false;
- if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
- InBounds = Gep->isInBounds();
- ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
- }
-
State.setDebugLocFrom(getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 730deb0686b2a..d65939728ea7c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2512,6 +2512,21 @@ void VPlanTransforms::createInterleaveGroups(
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
}
+ // If the group is reverse, adjust the index to refer to the last vector
+ // lane instead of the first. We adjust the index from the first vector
+ // lane, rather than directly getting the pointer for lane VF - 1, because
+ // the pointer operand of the interleaved access is supposed to be uniform.
+ if (IG->isReverse()) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+ auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ InsertPos->getDebugLoc());
+ ReversePtr->insertBefore(InsertPos);
+ Addr = ReversePtr;
+ }
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 8c2958769a615..0031b7579cb60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1577,8 +1577,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1597,8 +1597,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
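For illustration, the offset that the new VPReverseInterleavePtrRecipe::execute above materializes can be sketched as standalone C++ (not LLVM code; the helper name and the sample values are assumptions for this example only):

#include <cstdint>
#include <cstdio>

// Element offset added to the interleave group's lane-0 pointer so that a
// reversed wide access covers lanes VF-1 .. 0 of the group.
int64_t reverseInterleaveOffset(int64_t RuntimeVF, int64_t Factor) {
  int64_t Index = (RuntimeVF - 1) * Factor; // last lane, scaled by the factor
  return -Index;                            // negate: step backwards in memory
}

int main() {
  // E.g. RuntimeVF = 8 (vscale x 4 with vscale = 2) and Factor = 2:
  // the wide access starts 14 elements before the lane-0 pointer.
  std::printf("%lld\n",
              (long long)reverseInterleaveOffset(/*RuntimeVF=*/8, /*Factor=*/2));
  return 0;
}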
>From 052c8890a0538c4a2cf9c5d16e11bfa979ec86bc Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 20 Jun 2025 01:44:32 -0700
Subject: [PATCH 02/10] Stride VectorEndPointer for reverse interleaved access
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++---
.../Transforms/Vectorize/VPlanTransforms.cpp | 5 +-
.../AArch64/sve-interleaved-accesses.ll | 24 +++----
.../RISCV/riscv-vector-reverse-output.ll | 48 ++++++++-----
.../RISCV/riscv-vector-reverse.ll | 68 ++++++++++---------
...-force-tail-with-evl-reverse-load-store.ll | 21 ++++--
...orize-force-tail-with-evl-uniform-store.ll | 3 +-
9 files changed, 117 insertions(+), 87 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fed1ac5552a1e..b338bd1c3dbcf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7768,8 +7768,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
(CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
? GEPNoWrapFlags::none()
: GEPNoWrapFlags::inBounds();
- VectorPtr = new VPVectorEndPointerRecipe(
- Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+ VectorPtr =
+ new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
+ /*Stride*/ -1, Flags, I->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 604c14a4293bb..85f97f375da9d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1713,12 +1713,16 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
+ int64_t Stride;
+
public:
VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
- GEPNoWrapFlags GEPFlags, DebugLoc DL)
+ int64_t Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
- IndexedTy(IndexedTy) {}
+ IndexedTy(IndexedTy), Stride(Stride) {
+ assert(Stride != 0 && "Unexpected stride");
+ }
VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
@@ -1750,7 +1754,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
VPVectorEndPointerRecipe *clone() override {
return new VPVectorEndPointerRecipe(getOperand(0), getVFValue(), IndexedTy,
- getGEPNoWrapFlags(), getDebugLoc());
+ Stride, getGEPNoWrapFlags(),
+ getDebugLoc());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 459646f0f6a9c..419c9989f6118 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2310,12 +2310,12 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
+static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
unsigned CurrentPart, IRBuilderBase &Builder) {
// Use i32 for the gep index type when the value is constant,
// or query DataLayout for a more suitable index type otherwise.
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
- return IsScalable && (IsReverse || CurrentPart > 0)
+ return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
? DL.getIndexType(Builder.getPtrTy(0))
: Builder.getInt32Ty();
}
@@ -2323,18 +2323,21 @@ static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
+ bool IsUnitStride = Stride == 1 || Stride == -1;
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
- CurrentPart, Builder);
+ IsUnitStride, CurrentPart, Builder);
// The wide store needs to start at the last vector element.
Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
if (IndexTy != RunTimeVF->getType())
RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
- // NumElt = -CurrentPart * RunTimeVF
+ // NumElt = Stride * CurrentPart * RunTimeVF
Value *NumElt = Builder.CreateMul(
- ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
- // LastLane = 1 - RunTimeVF
- Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+ ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
+ // LastLane = Stride * (RunTimeVF - 1)
+ Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
+ if (Stride != 1)
+ LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
@@ -2359,7 +2362,7 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
- CurrentPart, Builder);
+ /*IsUnitStride*/ true, CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d65939728ea7c..2cf0449286706 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2519,8 +2519,9 @@ void VPlanTransforms::createInterleaveGroups(
if (IG->isReverse()) {
auto *GEP = dyn_cast<GetElementPtrInst>(
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
- auto *ReversePtr = new VPReverseInterleavePtrRecipe(
- Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ auto *ReversePtr = new VPVectorEndPointerRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
+ -(int64_t)IG->getFactor(),
GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
: GEPNoWrapFlags::none(),
InsertPos->getDebugLoc());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0031b7579cb60..b349c55d3e09a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,10 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i64 2, [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
@@ -381,10 +379,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = sub nsw i64 2, [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
@@ -1577,10 +1573,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = sub nsw i64 4, [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
@@ -1597,10 +1591,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP25:%.*]] = sub nsw i64 4, [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
; CHECK-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 09b274de30214..29b27cdb7556d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -40,7 +40,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
@@ -48,7 +49,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
@@ -98,7 +100,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]]
+; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]]
; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -107,7 +110,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]]
+; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -157,11 +161,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -172,11 +178,13 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
@@ -246,7 +254,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
@@ -254,7 +263,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]]
+; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
@@ -304,7 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]]
+; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]]
; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
@@ -313,7 +324,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]]
+; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
@@ -363,11 +375,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
@@ -378,11 +392,13 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index dd8b7d6ea7e42..b4e49a60e0887 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -334,22 +334,24 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 1, %18
-; CHECK-NEXT: %26 = getelementptr inbounds i32, ptr %23, i64 %24
-; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %26, i64 %25
-; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %27, align 4
+; CHECK-NEXT: %25 = sub i64 %18, 1
+; CHECK-NEXT: %26 = mul i64 -1, %25
+; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24
+; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26
+; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %28, align 4
; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
-; CHECK-NEXT: %28 = add <vscale x 4 x i32> %reverse, splat (i32 1)
-; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT: %30 = mul i64 0, %18
-; CHECK-NEXT: %31 = sub i64 1, %18
-; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %29, i64 %30
-; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %32, i64 %31
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %33, align 4
+; CHECK-NEXT: %29 = add <vscale x 4 x i32> %reverse, splat (i32 1)
+; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT: %31 = mul i64 0, %18
+; CHECK-NEXT: %32 = sub i64 %18, 1
+; CHECK-NEXT: %33 = mul i64 -1, %32
+; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31
+; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33
+; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29)
+; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %35, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -380,8 +382,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %35 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %35, 1
+; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: %add9 = add i32 %37, 1
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -743,22 +745,24 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %22 = zext i32 %21 to i64
; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 1, %18
-; CHECK-NEXT: %26 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %26, i64 %25
-; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %27, align 4
+; CHECK-NEXT: %25 = sub i64 %18, 1
+; CHECK-NEXT: %26 = mul i64 -1, %25
+; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24
+; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26
+; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %28, align 4
; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
-; CHECK-NEXT: %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %30 = mul i64 0, %18
-; CHECK-NEXT: %31 = sub i64 1, %18
-; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %29, i64 %30
-; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %32, i64 %31
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %33, align 4
+; CHECK-NEXT: %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
+; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT: %31 = mul i64 0, %18
+; CHECK-NEXT: %32 = sub i64 %18, 1
+; CHECK-NEXT: %33 = mul i64 -1, %32
+; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31
+; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33
+; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29)
+; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %35, align 4
; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
; CHECK-NEXT: LV: created middle.block
; CHECK-NEXT: LV: draw edge from vector.body
; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
@@ -789,8 +793,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %35 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %35, 1.000000e+00
+; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00
; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 96db5bf4e9acc..91d94e52d0990 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -33,7 +33,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP18]], 1
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 -1, [[TMP11]]
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -41,7 +42,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -136,7 +138,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[TMP26]], 1
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP15]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -145,7 +148,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1
+; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
@@ -261,7 +265,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
+; IF-EVL-NEXT: [[TMP29:%.*]] = sub i64 [[TMP9]], 1
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP29]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
@@ -271,7 +276,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP16]], 1
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP30]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
@@ -279,7 +285,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
+; IF-EVL-NEXT: [[TMP31:%.*]] = sub i64 [[TMP22]], 1
+; IF-EVL-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP31]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 5c94ce180578f..984b64c55ce16 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -34,7 +34,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
+; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
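To make the stride generalization in VPVectorEndPointerRecipe::execute easier to follow, here is a standalone C++ sketch (not LLVM code; names and sample values are illustrative only) of the two offsets it now emits. It shows that Stride = -1 reproduces the previous unit-stride reverse formulas and that Stride = -Factor reproduces the interleave adjustment from patch 01:

#include <cstdint>
#include <cstdio>

struct EndPointerOffsets {
  int64_t NumElt;   // Stride * CurrentPart * RunTimeVF
  int64_t LastLane; // Stride * (RunTimeVF - 1)
};

EndPointerOffsets computeOffsets(int64_t Stride, int64_t CurrentPart,
                                 int64_t RunTimeVF) {
  return {Stride * CurrentPart * RunTimeVF, Stride * (RunTimeVF - 1)};
}

int main() {
  // Stride -1 (reversed unit-stride access), part 0, RunTimeVF 8:
  // NumElt = 0 and LastLane = -7, i.e. the old -CurrentPart * RunTimeVF
  // and 1 - RunTimeVF values.
  EndPointerOffsets R = computeOffsets(-1, /*CurrentPart=*/0, /*RunTimeVF=*/8);
  std::printf("unit-stride reverse: NumElt=%lld LastLane=%lld\n",
              (long long)R.NumElt, (long long)R.LastLane);

  // Stride -2 (reversed interleave group with factor 2), part 0, RunTimeVF 8:
  // LastLane = -14, matching -(RuntimeVF - 1) * Factor from patch 01.
  EndPointerOffsets G = computeOffsets(-2, /*CurrentPart=*/0, /*RunTimeVF=*/8);
  std::printf("reversed interleave: NumElt=%lld LastLane=%lld\n",
              (long long)G.NumElt, (long long)G.LastLane);
  return 0;
}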
>From 9840a7057f255359beca01e192b68d113f76499f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Jun 2025 02:20:10 -0700
Subject: [PATCH 03/10] Remove VPReverseInterleavePtrRecipe
---
.../Transforms/Vectorize/LoopVectorize.cpp | 1 -
llvm/lib/Transforms/Vectorize/VPlan.h | 49 -------------------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 7 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 -----------
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 -
5 files changed, 3 insertions(+), 83 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b338bd1c3dbcf..ea3ea184fcffb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4257,7 +4257,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
- case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 85f97f375da9d..eb11e5c4e5935 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,7 +531,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
- case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -852,7 +851,6 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
- R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1814,53 +1812,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
-class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
- Type *IndexedTy;
- unsigned Factor;
-
-public:
- VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
- unsigned Factor, GEPNoWrapFlags GEPFlags,
- DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
- ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
- IndexedTy(IndexedTy), Factor(Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- }
-
- VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
-
- VPValue *getPtr() const { return getOperand(0); }
-
- VPValue *getVFValue() const { return getOperand(1); }
-
- void execute(VPTransformState &State) override;
-
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return true;
- }
-
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override {
- // TODO: Compute accurate cost after retiring the legacy cost model.
- return 0;
- }
-
- VPReverseInterleavePtrRecipe *clone() override {
- return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
- Factor, getGEPNoWrapFlags(),
- getDebugLoc());
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-};
-
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c45bc743c9fc7..f3b99fe34c069 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -283,10 +283,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
- [this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 419c9989f6118..d62ff2a7112d2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,7 +150,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
- case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2383,33 +2382,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
- auto &Builder = State.Builder;
- Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
- Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
- Type *IndexTy = Builder.getInt32Ty();
- if (RuntimeVF->getType() != IndexTy)
- RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
- Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
- Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
- Index = Builder.CreateNeg(Index);
- Value *ReversePtr =
- Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
-
- State.set(this, ReversePtr, /*IsScalar*/ true);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent;
- printAsOperand(O, SlotTracker);
- O << " = reverse-interleave-ptr";
- printFlags(O);
- printOperands(O, SlotTracker);
-}
-#endif
-
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83f6ac223af1e..a0d3dc9b934cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,7 +335,6 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
- VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
>From af2f6a70ffc6a1b67c53d0bc2f75b6020c106922 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 24 Jun 2025 02:22:38 -0700
Subject: [PATCH 04/10] comment
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eb11e5c4e5935..d47c40e255e72 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1706,7 +1706,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// A recipe to compute a pointer to the last element of each part of a widened
/// memory access for widened memory accesses of IndexedTy. Used for
-/// VPWidenMemoryRecipes that are reversed.
+/// VPWidenMemoryRecipes or VPInterleaveRecipes that are reversed.
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
>From d115d9ce086e6a95b7dc7a5a8d1d199279bbaafb Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 00:45:54 -0700
Subject: [PATCH 05/10] Refine assertion
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d47c40e255e72..d7f2fcc2f196f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1719,7 +1719,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
IndexedTy(IndexedTy), Stride(Stride) {
- assert(Stride != 0 && "Unexpected stride");
+ assert(Stride != 0 && "Stride cannot be zero");
}
VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
>From fb4c54532999b43c0b2aa2d4ada20f4b723f23c4 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 01:34:37 -0700
Subject: [PATCH 06/10] Add comment for stride
---
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d7f2fcc2f196f..b7ec259ff24f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1711,6 +1711,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
+ /// The constant stride of the pointer computed by this recipe.
int64_t Stride;
public:
>From 6e68e03a540c4eceec1a8f0443052f7184c54240 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 01:57:15 -0700
Subject: [PATCH 07/10] Reuse InBounds
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2cf0449286706..c60b21a91dac0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2482,23 +2482,23 @@ void VPlanTransforms::createInterleaveGroups(
auto *InsertPos =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
+ bool InBounds = false;
+ if (auto *Gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
+ InBounds = Gep->isInBounds();
+
// Get or create the start address for the interleave group.
auto *Start =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
VPValue *Addr = Start->getAddr();
VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
- // TODO: Hoist Addr's defining recipe (and any operands as needed) to
- // InsertPos or sink loads above zero members to join it.
- bool InBounds = false;
- if (auto *Gep = dyn_cast<GetElementPtrInst>(
- getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
- InBounds = Gep->isInBounds();
-
// We cannot re-use the address of member zero because it does not
// dominate the insert position. Instead, use the address of the insert
// position and create a PtrAdd adjusting it to the address of member
// zero.
+ // TODO: Hoist Addr's defining recipe (and any operands as needed) to
+ // InsertPos or sink loads above zero members to join it.
assert(IG->getIndex(IRInsertPos) != 0 &&
"index of insert position shouldn't be zero");
auto &DL = IRInsertPos->getDataLayout();
@@ -2522,8 +2522,7 @@ void VPlanTransforms::createInterleaveGroups(
auto *ReversePtr = new VPVectorEndPointerRecipe(
Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
-(int64_t)IG->getFactor(),
- GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
- : GEPNoWrapFlags::none(),
+ InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(),
InsertPos->getDebugLoc());
ReversePtr->insertBefore(InsertPos);
Addr = ReversePtr;
>From ef6b46a55b9517f9de79b42424f6257cff83638f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 23:23:31 -0700
Subject: [PATCH 08/10] assert negative stride
---
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b7ec259ff24f6..fce146acdc247 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1720,7 +1720,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
IndexedTy(IndexedTy), Stride(Stride) {
- assert(Stride != 0 && "Stride cannot be zero");
+ assert(Stride < 0 && "Stride must be negative");
}
VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
>From 736b5a3d7294fcaae0628b41f6770f3b33dd98af Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 23:26:43 -0700
Subject: [PATCH 09/10] Remove dead GEP
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c60b21a91dac0..5f2884c178cbe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2517,8 +2517,6 @@ void VPlanTransforms::createInterleaveGroups(
// lane, rather than directly getting the pointer for lane VF - 1, because
// the pointer operand of the interleaved access is supposed to be uniform.
if (IG->isReverse()) {
- auto *GEP = dyn_cast<GetElementPtrInst>(
- getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
auto *ReversePtr = new VPVectorEndPointerRecipe(
Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
-(int64_t)IG->getFactor(),
>From 5a7c4bf6cf35123f17749974200a9662ee707561 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 30 Jun 2025 23:52:03 -0700
Subject: [PATCH 10/10] Refine comment
---
llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fce146acdc247..b3c54f0e3e2f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1711,7 +1711,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
- /// The constant stride of the pointer computed by this recipe.
+ /// The constant stride of the pointer computed by this recipe, expressed in
+ /// units of IndexedTy.
int64_t Stride;
public:
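As a small standalone illustration (plain C++, not LLVM code) of what "expressed in units of IndexedTy" means: the recipe's Stride counts IndexedTy elements, and the GEP it emits scales that element offset by the element size when forming the byte address. The helper and the sample values below are assumptions made only for this example:

#include <cstdint>
#include <cstdio>

// Byte offset of the last lane for a given element stride, runtime VF and
// IndexedTy element size; mirrors LastLane = Stride * (RunTimeVF - 1).
int64_t lastLaneByteOffset(int64_t StrideInElements, int64_t RunTimeVF,
                           int64_t ElementSizeInBytes) {
  int64_t LastLane = StrideInElements * (RunTimeVF - 1);
  return LastLane * ElementSizeInBytes;
}

int main() {
  // Stride -2 over i32 (4 bytes) with RunTimeVF 8: -14 elements = -56 bytes.
  std::printf("%lld\n", (long long)lastLaneByteOffset(-2, 8, 4));
  return 0;
}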