[llvm] [VPlan] Implement VPWidenLoad/StoreEVLRecipe::computeCost(). (PR #109644)

Elvis Wang via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 25 07:00:12 PDT 2024


https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/109644

From 5be3a79ccb25a87f581ba8bce86d9f2ad0df6a42 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 23 Sep 2024 02:57:58 -0700
Subject: [PATCH 1/4] [LV][VPlan] Don't add shuffle cost when storing a
 loop-invariant value.

This patch makes VPWidenMemoryRecipe and the legacy cost model skip the
reverse shuffle cost when the stored value is loop invariant.

This patch also fixes #109468.
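
For intuition, here is a minimal sketch of the costing idea (the helper
name and parameters are hypothetical, not the actual LLVM code): a
loop-invariant stored value is widened to a splat, and reversing a splat
is a no-op, so the SK_Reverse shuffle cost can be skipped.

  #include "llvm/Support/InstructionCost.h"
  using llvm::InstructionCost;

  // Hypothetical sketch: cost of a consecutive, possibly reversed store.
  // A splat (loop-invariant stored value) is unchanged by a reverse
  // shuffle, so no SK_Reverse shuffle cost is added in that case.
  InstructionCost consecutiveStoreCost(InstructionCost MemOpCost,
                                       InstructionCost ReverseCost,
                                       bool Reverse,
                                       bool IsLoopInvariantStoreValue) {
    InstructionCost Cost = MemOpCost;
    if (Reverse && !IsLoopInvariantStoreValue)
      Cost += ReverseCost;
    return Cost;
  }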
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   5 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   4 +-
 ...orize-force-tail-with-evl-uniform-store.ll | 160 ++++++++++++++++++
 3 files changed, 167 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 806dbd5bb7304e..ec43cafc685a9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5666,7 +5666,10 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   }
 
   bool Reverse = ConsecutiveStride < 0;
-  if (Reverse)
+  const StoreInst *SI = dyn_cast<StoreInst>(I);
+  bool IsLoopInvariantStoreValue =
+      SI && Legal->isInvariant(const_cast<StoreInst *>(SI)->getValueOperand());
+  if (Reverse && !IsLoopInvariantStoreValue)
     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
                                CostKind, 0);
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a2cfbc35cb84f..8bd67109403d35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2253,7 +2253,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
                                     CostKind, OpInfo, &Ingredient);
   }
-  if (!Reverse)
+  // If the stored value is a uniform live-in scalar, we don't need to
+  // calculate the reverse shuffle cost.
+  if (!Reverse || (isa<StoreInst>(Ingredient) && getOperand(1)->isLiveIn()))
     return Cost;
 
   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
new file mode 100644
index 00000000000000..7fd9b50a53836d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S| FileCheck %s
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S| FileCheck %s --check-prefixes=EVL
+; COM: From issue #109468
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @lshift_significand(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP12]]
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IV]], i32 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP14]], i64 [[TMP1]])
+; CHECK-NEXT:    [[IDXPROM12:%.*]] = sub nuw nsw i64 1, [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[IDXPROM12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 0, [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = sub i64 1, [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i64, ptr [[TMP21]], i64 [[TMP20]]
+; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[REVERSE1]], ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[REVERSE]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[FOR_END16:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY9:.*]]
+; CHECK:       [[FOR_BODY9]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY9]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[INDVARS_IV]]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP24]]
+; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END16]], label %[[FOR_BODY9]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END16]]:
+; CHECK-NEXT:    ret void
+;
+; EVL-LABEL: define void @lshift_significand(
+; EVL-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; EVL-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; EVL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; EVL-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP1]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[SUB11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 2, i1 true)
+; EVL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; EVL-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP14]]
+; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; EVL-NEXT:    [[TMP18:%.*]] = mul i64 0, [[TMP17]]
+; EVL-NEXT:    [[TMP19:%.*]] = sub i64 1, [[TMP17]]
+; EVL-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[TMP18]]
+; EVL-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i64 [[TMP19]]
+; EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[SUB11]])
+; EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[SUB11]])
+; EVL-NEXT:    [[IDXPROM12:%.*]] = zext i32 [[SUB11]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[IDXPROM12]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
+; EVL-NEXT:    br i1 true, label %[[FOR_END16:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; EVL-NEXT:    br label %[[FOR_BODY9:.*]]
+; EVL:       [[FOR_BODY9]]:
+; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY9]] ]
+; EVL-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[INDVARS_IV]]
+; EVL-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP24]]
+; EVL-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
+; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 3
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END16]], label %[[FOR_BODY9]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL:       [[FOR_END16]]:
+; EVL-NEXT:    ret void
+;
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+entry:
+  %cmp1.peel = icmp eq i32 %n, 0
+  %spec.select = select i1 %cmp1.peel, i64 2, i64 0
+  br label %for.body9
+
+for.body9:                                        ; preds = %entry, %for.body9
+  %indvars.iv = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
+  %1 = sub nuw nsw i64 1, %indvars.iv
+  %arrayidx13 = getelementptr [3 x i64], ptr %0, i64 0, i64 %1
+  store i64 0, ptr %arrayidx13, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 3
+  br i1 %exitcond.not, label %for.end16, label %for.body9
+
+for.end16:                                        ; preds = %for.body9
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

From b6309955e56539bad0f1a81517a1c336493eae21 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 23 Sep 2024 09:00:44 -0700
Subject: [PATCH 2/4] [VPlan] Implement VPWidenStoreEVLRecipe::computeCost().

Currently, the EVL recipes fold the tail mask into the explicit vector
length (EVL). In the legacy cost model, however, the mask still exists
and its instruction cost is always included.
To avoid a mismatch between the VPlan-based cost model and the legacy
cost model, we always include the instruction cost of the mask in the EVL
recipes.

Note that the mask cost should be removed from the EVL recipes once we no
longer need to compare against the legacy cost model.

This patch also fixes #109468.
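
For intuition, a simplified sketch of the pricing decision (the helper is
hypothetical; the TTI hooks are the ones used in the patch below):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Price a consecutive EVL store as a *masked* store: the legacy model
  // always costs the mask, even though the EVL recipe replaced the tail
  // mask with the explicit vector length.
  InstructionCost evlConsecutiveStoreCost(const TargetTransformInfo &TTI,
                                          VectorType *Ty, Align Alignment,
                                          unsigned AS, bool Reverse) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost Cost = TTI.getMaskedMemoryOpCost(
        Instruction::Store, Ty, Alignment, AS, CostKind);
    if (Reverse) // a reversed access additionally pays a reverse shuffle
      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, Ty, {},
                                 CostKind, 0);
    return Cost;
  }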
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   5 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   4 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  41 ++++-
 ...orize-force-tail-with-evl-uniform-store.ll | 143 +-----------------
 4 files changed, 47 insertions(+), 146 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ec43cafc685a9e..806dbd5bb7304e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5666,10 +5666,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   }
 
   bool Reverse = ConsecutiveStride < 0;
-  const StoreInst *SI = dyn_cast<StoreInst>(I);
-  bool IsLoopInvariantStoreValue =
-      SI && Legal->isInvariant(const_cast<StoreInst *>(SI)->getValueOperand());
-  if (Reverse && !IsLoopInvariantStoreValue)
+  if (Reverse)
     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
                                CostKind, 0);
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 73d218cdc7ac27..14a867f505e78e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2787,6 +2787,10 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   /// Generate the wide store or scatter.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenStoreEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8bd67109403d35..2ae9739d4773cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2253,9 +2253,7 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
                                     CostKind, OpInfo, &Ingredient);
   }
-  // If the stored value is a uniform live-in scalar, we don't need to
-  // calculate the reverse shuffle cost.
-  if (!Reverse || (isa<StoreInst>(Ingredient) && getOperand(1)->isLiveIn()))
+  if (!Reverse)
     return Cost;
 
   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
@@ -2466,6 +2464,43 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   State.addMetadata(NewSI, SI);
 }
 
+InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment =
+      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
+  unsigned AS =
+      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  if (!Consecutive) {
+    // TODO: Using the original IR may not be accurate.
+    // Currently, ARM will use the underlying IR to calculate gather/scatter
+    // instruction cost.
+    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+    assert(!Reverse &&
+           "Inconsecutive memory access should not be reversed.");
+    return Ctx.TTI.getAddressComputationCost(Ty) +
+           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+                                          IsMasked, Alignment, CostKind,
+                                          &Ingredient);
+  }
+
+  InstructionCost Cost = 0;
+  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes use EVL to replace the tail mask, but the
+  // legacy model will always calculate the cost of the mask.
+  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
+  // don't need to care about the legacy cost model.
+  Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+                                        AS, CostKind);
+  if (!Reverse)
+    return Cost;
+
+  return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                        cast<VectorType>(Ty), {}, CostKind, 0);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 7fd9b50a53836d..18cf497068a224 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -1,135 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S| FileCheck %s
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S| FileCheck %s --check-prefixes=EVL
-; COM: From issue #109468
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl
+; Generated from issue #109468.
+
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
 target triple = "riscv64-unknown-linux-gnu"
 
 define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
-; CHECK-LABEL: define void @lshift_significand(
-; CHECK-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[INDEX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP12]]
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP14]], i64 [[TMP1]])
-; CHECK-NEXT:    [[IDXPROM12:%.*]] = sub nuw nsw i64 1, [[TMP11]]
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 0, [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = sub i64 1, [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i64, ptr [[TMP21]], i64 [[TMP20]]
-; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[REVERSE1]], ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[REVERSE]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[FOR_END16:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
-; CHECK-NEXT:    br label %[[FOR_BODY9:.*]]
-; CHECK:       [[FOR_BODY9]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY9]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP24]]
-; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX14]], align 8
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END16]], label %[[FOR_BODY9]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       [[FOR_END16]]:
-; CHECK-NEXT:    ret void
-;
-; EVL-LABEL: define void @lshift_significand(
-; EVL-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; EVL-NEXT:  [[ENTRY:.*]]:
-; EVL-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
-; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
-; EVL-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
-; EVL-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
-; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; EVL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; EVL-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; EVL:       [[VECTOR_PH]]:
-; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
-; EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; EVL-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
-; EVL:       [[VECTOR_BODY]]:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP1]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[SUB11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 2, i1 true)
-; EVL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; EVL-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
-; EVL-NEXT:    [[TMP15:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP14]]
-; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
-; EVL-NEXT:    [[TMP18:%.*]] = mul i64 0, [[TMP17]]
-; EVL-NEXT:    [[TMP19:%.*]] = sub i64 1, [[TMP17]]
-; EVL-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[TMP18]]
-; EVL-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i64 [[TMP19]]
-; EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[SUB11]])
-; EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[SUB11]])
-; EVL-NEXT:    [[IDXPROM12:%.*]] = zext i32 [[SUB11]] to i64
-; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[IDXPROM12]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL:       [[MIDDLE_BLOCK]]:
-; EVL-NEXT:    br i1 true, label %[[FOR_END16:.*]], label %[[SCALAR_PH]]
-; EVL:       [[SCALAR_PH]]:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
-; EVL-NEXT:    br label %[[FOR_BODY9:.*]]
-; EVL:       [[FOR_BODY9]]:
-; EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY9]] ]
-; EVL-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[INDVARS_IV]]
-; EVL-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP24]]
-; EVL-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
-; EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 3
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END16]], label %[[FOR_BODY9]], !llvm.loop [[LOOP3:![0-9]+]]
-; EVL:       [[FOR_END16]]:
-; EVL-NEXT:    ret void
-;
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
 entry:
   %cmp1.peel = icmp eq i32 %n, 0
   %spec.select = select i1 %cmp1.peel, i64 2, i64 0
@@ -147,14 +23,3 @@ for.body9:                                        ; preds = %entry, %for.body9
 for.end16:                                        ; preds = %for.body9
   ret void
 }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
-; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.

From d1af59eea7e9fb6a554803295f01680ea6e12e34 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Tue, 24 Sep 2024 01:18:12 -0700
Subject: [PATCH 3/4] Address comments

Reuse VPWidenMemoryRecipe::computeCost().
Implement VPWidenLoadEVLRecipe::computeCost().
Rename basic blocks in the testcase.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |   4 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  63 +++++---
 ...orize-force-tail-with-evl-uniform-store.ll | 143 ++++++++++++++++--
 3 files changed, 174 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 14a867f505e78e..79b555735396df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2709,6 +2709,10 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   /// Generate the wide load or gather.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenLoadEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2ae9739d4773cb..dc6e47bd90ab2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2364,6 +2364,32 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   State.set(this, Res, 0);
 }
 
+InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
+                                                  VPCostContext &Ctx) const {
+  if (!Consecutive || IsMasked) {
+    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
+  }
+
+  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes use EVL to replace the tail mask, but the
+  // legacy model will always calculate the cost of the mask.
+  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
+  // don't need to compare to the legacy cost model.
+  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment =
+      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
+  unsigned AS =
+      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
+      Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+  if (!Reverse)
+    return Cost;
+
+  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                       cast<VectorType>(Ty), {}, CostKind, 0);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
@@ -2466,39 +2492,28 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
 
 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
-  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment =
-      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
-  unsigned AS =
-      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
-  if (!Consecutive) {
-    // TODO: Using the original IR may not be accurate.
-    // Currently, ARM will use the underlying IR to calculate gather/scatter
-    // instruction cost.
-    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    assert(!Reverse &&
-           "Inconsecutive memory access should not be reversed.");
-    return Ctx.TTI.getAddressComputationCost(Ty) +
-           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                          IsMasked, Alignment, CostKind,
-                                          &Ingredient);
+  if (!Consecutive || IsMasked) {
+    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
   }
 
-  InstructionCost Cost = 0;
   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
   // here because the EVL recipes use EVL to replace the tail mask, but the
   // legacy model will always calculate the cost of the mask.
   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
-  // don't need to care about the legacy cost model.
-  Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
-                                        AS, CostKind);
+  // don't need to compare to the legacy cost model.
+  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment =
+      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
+  unsigned AS =
+      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
+      Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
   if (!Reverse)
     return Cost;
 
-  return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                        cast<VectorType>(Ty), {}, CostKind, 0);
+  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                       cast<VectorType>(Ty), {}, CostKind, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 18cf497068a224..4495177a54714b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -1,25 +1,144 @@
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S | FileCheck %s --check-prefixes=EVL
 ; Generated from issue #109468.
+; In this test case, the tail-masked vector store is transformed into a VP intrinsic with EVL.
 
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
 target triple = "riscv64-unknown-linux-gnu"
 
-define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
+define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
+; CHECK-LABEL: define void @lshift_significand(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[VEC_IV]], i32 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[REVERSE]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], 4
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[IV]]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
+; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+; EVL-LABEL: define void @lshift_significand(
+; EVL-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; EVL-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; EVL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; EVL-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP1]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 2, i1 true)
+; EVL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; EVL-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
+; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; EVL-NEXT:    [[TMP18:%.*]] = mul i64 0, [[TMP17]]
+; EVL-NEXT:    [[TMP19:%.*]] = sub i64 1, [[TMP17]]
+; EVL-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[TMP18]]
+; EVL-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i64 [[TMP19]]
+; EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP12]])
+; EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP12]])
+; EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP12]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
+; EVL-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[IV]]
+; EVL-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
+; EVL-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    ret void
+;
 entry:
   %cmp1.peel = icmp eq i32 %n, 0
   %spec.select = select i1 %cmp1.peel, i64 2, i64 0
-  br label %for.body9
+  br label %loop
 
-for.body9:                                        ; preds = %entry, %for.body9
-  %indvars.iv = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
-  %1 = sub nuw nsw i64 1, %indvars.iv
-  %arrayidx13 = getelementptr [3 x i64], ptr %0, i64 0, i64 %1
+loop:
+  %iv = phi i64 [ %spec.select, %entry ], [ %iv.next, %loop ]
+  %1 = sub nuw nsw i64 1, %iv
+  %arrayidx13 = getelementptr i64, ptr %dst, i64 %1
   store i64 0, ptr %arrayidx13, align 8
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 3
-  br i1 %exitcond.not, label %for.end16, label %for.body9
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 3
+  br i1 %exitcond.not, label %exit, label %loop
 
-for.end16:                                        ; preds = %for.body9
+exit:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

From 21e00c87e9551bdae83ccd4910de2776fe4fb160 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 25 Sep 2024 06:56:30 -0700
Subject: [PATCH 4/4] Address comments

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   6 +-
 ...orize-force-tail-with-evl-uniform-store.ll | 124 +++++-------------
 2 files changed, 38 insertions(+), 92 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index dc6e47bd90ab2f..3779ec4c2390d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2366,9 +2366,8 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
 
 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
-  if (!Consecutive || IsMasked) {
+  if (!Consecutive || IsMasked)
     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
-  }
 
   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
   // here because the EVL recipes use EVL to replace the tail mask, but the
@@ -2492,9 +2491,8 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
 
 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
-  if (!Consecutive || IsMasked) {
+  if (!Consecutive || IsMasked)
     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
-  }
 
   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
   // here because the EVL recipes use EVL to replace the tail mask, but the
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 4495177a54714b..870925950ae498 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S | FileCheck %s --check-prefixes=EVL
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S | FileCheck %s
 ; Generated from issue #109468.
 ; In this test case, the tail-masked vector store is transformed into a VP intrinsic with EVL.
 
@@ -13,31 +12,45 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 2, i1 true)
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 [[TMP1]])
-; CHECK-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -3
-; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[REVERSE]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
+; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -45,8 +58,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[IV]]
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sub nuw nsw i64 1, [[IV]]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
 ; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
@@ -54,66 +67,6 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
-; EVL-LABEL: define void @lshift_significand(
-; EVL-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; EVL-NEXT:  [[ENTRY:.*]]:
-; EVL-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
-; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
-; EVL-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
-; EVL-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
-; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; EVL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; EVL-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; EVL:       [[VECTOR_PH]]:
-; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
-; EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
-; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
-; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; EVL-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
-; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
-; EVL:       [[VECTOR_BODY]]:
-; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP1]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 2, i1 true)
-; EVL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; EVL-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
-; EVL-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
-; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
-; EVL-NEXT:    [[TMP18:%.*]] = mul i64 0, [[TMP17]]
-; EVL-NEXT:    [[TMP19:%.*]] = sub i64 1, [[TMP17]]
-; EVL-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[TMP18]]
-; EVL-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i64 [[TMP19]]
-; EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP12]])
-; EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP12]])
-; EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP12]] to i64
-; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]]
-; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL:       [[MIDDLE_BLOCK]]:
-; EVL-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; EVL:       [[SCALAR_PH]]:
-; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
-; EVL-NEXT:    br label %[[LOOP:.*]]
-; EVL:       [[LOOP]]:
-; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; EVL-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[IV]]
-; EVL-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
-; EVL-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
-; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
-; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; EVL:       [[EXIT]]:
-; EVL-NEXT:    ret void
-;
 entry:
   %cmp1.peel = icmp eq i32 %n, 0
   %spec.select = select i1 %cmp1.peel, i64 2, i64 0
@@ -137,8 +90,3 @@ exit:
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ;.
-; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.


