[llvm] [WIP][LV][VPlan] For reverse consecutive load and store accesses, use unit-stride loads/stores (PR #130032)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 01:25:12 PDT 2025
https://github.com/LiqinWeng updated https://github.com/llvm/llvm-project/pull/130032
From 6f5f6c2fe1f3e5c4ca6893d07042e8f0f841459b Mon Sep 17 00:00:00 2001
From: "Liqin.Weng" <liqin.weng at spacemit.com>
Date: Mon, 23 Jun 2025 16:24:08 +0800
Subject: [PATCH] [LV][VPlan] When the load/store stride is -1, use vle/vse
instead of vlse/vsse
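
With this change, a reverse (stride -1) consecutive access is emitted as a
plain unit-stride vp.load/vp.store whose base pointer is rebased onto the
last active lane, rather than a vp.load/vp.store combined with
experimental.vp.reverse of the data and mask, which (per the subject) ends
up as a strided vlse/vsse. A minimal IRBuilder sketch of the address
computation, mirroring the recipe changes below ("Builder", "EltTy", "Addr"
and "EVL" are the names used in the recipes; "RevAddr" and the
"reverse.addr" name are illustrative only):

  // Rebase the pointer so that lane 0 accesses the lowest address touched
  // by this vector step, i.e. &Addr[1 - EVL].
  Value *Offset = Builder.CreateSub(Builder.getInt32(1), EVL);
  Value *RevAddr = Builder.CreateGEP(EltTy, Addr, Offset, "reverse.addr");

The subsequent vp.load/vp.store then uses this rebased pointer with a unit
stride, so the RISC-V backend can select vle/vse instead of vlse/vsse with a
-1 stride; the vp.reverse calls on the loaded/stored data and mask are
dropped, and the reverse-shuffle cost is removed from computeCost
accordingly.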
---
.../Transforms/Vectorize/LoopVectorize.cpp | 9 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 78 +++++++++----------
...-force-tail-with-evl-reverse-load-store.ll | 77 +++++++-----------
...orize-force-tail-with-evl-uniform-store.ll | 11 +--
4 files changed, 75 insertions(+), 100 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..8267a29cb11c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7061,6 +7061,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}
+
+  // The VPlan-based cost model may compute the cost of a reverse EVL
+  // load/store differently from the legacy cost model.
+ if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R))
+ if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
+ return true;
+
if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
@@ -7758,7 +7765,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
- if (Reverse) {
+ if (Reverse && !CM.foldTailWithEVL()) {
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..f18a080124a7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2918,17 +2918,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
- Value *EVL, const Twine &Name) {
- VectorType *ValTy = cast<VectorType>(Operand->getType());
- Value *AllTrueMask =
- Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
- return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
- {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
@@ -2940,29 +2929,33 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask()) {
+ if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
- } else {
+ else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- }
if (CreateGather) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
+ if (isReverse()) {
+ auto *EltTy = DataTy->getElementType();
+      // if (EltTy->getScalarSizeInBits() !=
+      //     EVL->getType()->getScalarSizeInBits())
+      //   EVL = ConstantInt::getSigned(EVL->getType(),
+      //       static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+ auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+ Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP && GEP->isInBounds());
+ }
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
{Addr, Mask, EVL}, nullptr, "vp.op.load");
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
- Instruction *Res = NewLI;
- if (isReverse())
- Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
- State.set(this, Res);
+ State.set(this, NewLI);
}
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
@@ -2980,14 +2973,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
- if (!Reverse)
- return Cost;
-
- return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, Ctx.CostKind,
- 0);
+ return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Load, Ty, Alignment, AS,
+ Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3044,6 +3031,8 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
@@ -3053,22 +3042,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
- if (isReverse())
- StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask()) {
+ if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
- } else {
+ else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- }
+
Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
+ if (isReverse()) {
+ auto *EltTy = DataTy->getElementType();
+      // FIXME: We may not need to adjust for the element size here;
+      // InstCombine should take care of the offset type.
+      // if (EltTy->getScalarSizeInBits() != EVL->getType()->getScalarSizeInBits())
+      //   EVL = ConstantInt::getSigned(EVL->getType(),
+      //       static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+ auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+      // Value *Offset = Builder.CreateSub(
+      //     State.Builder.getIntN(EVL->getType()->getScalarSizeInBits(), 1),
+      //     EVL);
+ Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP && GEP->isInBounds());
+ }
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_store,
{StoredVal, Addr, Mask, EVL});
@@ -3093,14 +3092,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
- if (!Reverse)
- return Cost;
- return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, Ctx.CostKind,
- 0);
+ return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Store, Ty, Alignment, AS,
+ Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 4d8166eaa46f1..ed389bea3ead5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -31,21 +31,15 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP9]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP16]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -134,23 +128,15 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 [[TMP19]]
+; IF-EVL-NEXT: [[VP_OP_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP15]], i32 [[TMP22]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD2]], ptr align 4 [[TMP17]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -259,31 +245,22 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 [[TMP9]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_OP_LOAD]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i32 [[TMP19]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i32 [[TMP17]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP18]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 82e8d3d6c611a..fc10b8d093967 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -36,13 +36,10 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = sub i32 1, [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[TMP13]], i32 [[TMP14]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]