[llvm] [LV] Convert scatter w/uniform addr and mask being header mask to scalar store. (PR #172799)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 9 22:14:42 PST 2026
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/172799
>From cd64c614203ce80026baf98a509399f9db0136f8 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 17 Dec 2025 21:35:58 -0800
Subject: [PATCH 1/4] [LV] Convert scatter w/uniform addr and mask being header
mask to scalar store.
This patch converts scatter with uniform address and the mask being the
header mask to the `last-active-lane` + `extract-lane` + `scalar store`.
The header mask guarantees that the scatter has at least one
active lane, so the scatter can safely be converted to a scalar store.
Note that some dead instructions will be generated by
`extract-lane`; a follow-up PR will clean them up.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 27 ++-
.../LoopVectorize/AArch64/masked-call.ll | 149 ++++++++++---
.../LoopVectorize/AArch64/sve-tail-folding.ll | 11 +-
.../Transforms/LoopVectorize/RISCV/cse.ll | 18 +-
.../RISCV/gather-scatter-cost.ll | 98 ++++----
.../LoopVectorize/RISCV/induction-costs.ll | 9 +-
.../RISCV/pointer-induction-rv32.ll | 10 +-
.../LoopVectorize/RISCV/pointer-induction.ll | 34 ++-
.../LoopVectorize/RISCV/pr154103.ll | 58 +++--
.../Transforms/LoopVectorize/RISCV/pr88802.ll | 44 ++--
.../truncate-to-minimal-bitwidth-evl-crash.ll | 35 ++-
.../RISCV/type-info-cache-evl-crash.ll | 11 +-
.../LoopVectorize/RISCV/uniform-load-store.ll | 211 +++++++++++++++++-
.../vf-will-not-generate-any-vector-insts.ll | 43 +---
14 files changed, 541 insertions(+), 217 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9a89145fbf775..c6e65c8f4e6ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1620,8 +1620,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
continue;
- // Convert an unmasked scatter with an uniform address into
- // extract-last-lane + scalar store.
+ // Convert an unmasked or header-masked scatter with a uniform address
+ // into extract-last-lane + scalar store.
// TODO: Add a profitability check comparing the cost of a scatter vs.
// extract + scalar store.
auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
@@ -1631,14 +1631,25 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
"Not consecutive memory recipes shouldn't be reversed");
VPValue *Mask = WidenStoreR->getMask();
- // Only convert the scatter to a scalar store if it is unmasked.
- // TODO: Support converting scatter masked by the header mask to scalar
- // store.
- if (Mask)
+ // Convert the scatter to a scalar store if it is unmasked or header
+ // masked.
+ if (Mask && !vputils::isHeaderMask(Mask, Plan))
continue;
- auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane,
- {WidenStoreR->getOperand(1)});
+ VPInstruction *Extract;
+ if (!Mask) {
+ Extract = new VPInstruction(VPInstruction::ExtractLastLane,
+ {WidenStoreR->getOperand(1)});
+ } else {
+ // If the mask is the header mask, this mask contains at least one
+ // active lane. So it is safe to convert the scatter to a scalar
+ // store.
+ VPInstruction *Idx =
+ new VPInstruction(VPInstruction::LastActiveLane, Mask);
+ Idx->insertBefore(WidenStoreR);
+ Extract = new VPInstruction(VPInstruction::ExtractLane,
+ {Idx, WidenStoreR->getOperand(1)});
+ }
Extract->insertBefore(WidenStoreR);
// TODO: Sink the scalar store recipe to middle block if possible.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 00c7e6eecfb2c..7b9be2f4937aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -911,51 +911,130 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE: [[END]]:
; TFNONE-NEXT: ret void
;
-; TFCOMMON-LABEL: define void @test_widen_exp_v2(
-; TFCOMMON-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
-; TFCOMMON-NEXT: [[ENTRY:.*]]:
-; TFCOMMON-NEXT: br label %[[LOOP:.*]]
-; TFCOMMON: [[LOOP]]:
-; TFCOMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8
-; TFCOMMON-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR7:[0-9]+]]
-; TFCOMMON-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00
-; TFCOMMON-NEXT: [[SINK:%.*]] = select i1 [[COND1]], double 0.000000e+00, double 1.000000e+00
-; TFCOMMON-NEXT: store double [[SINK]], ptr [[P]], align 8
-; TFCOMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; TFCOMMON-NEXT: [[COND2:%.*]] = icmp eq i64 [[IV]], [[N]]
-; TFCOMMON-NEXT: br i1 [[COND2]], label %[[END:.*]], label %[[LOOP]]
-; TFCOMMON: [[END]]:
-; TFCOMMON-NEXT: ret void
+; TFALWAYS-LABEL: define void @test_widen_exp_v2(
+; TFALWAYS-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; TFALWAYS-NEXT: [[ENTRY:.*]]:
+; TFALWAYS-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; TFALWAYS-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; TFALWAYS-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TFALWAYS-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; TFALWAYS-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]]
+; TFALWAYS-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]]
+; TFALWAYS-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0
+; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
+; TFALWAYS-NEXT: br label %[[VECTOR_BODY:.*]]
+; TFALWAYS: [[VECTOR_BODY]]:
+; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TFALWAYS-NEXT: [[TMP8:%.*]] = load double, ptr [[P2]], align 8
+; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP8]], i64 0
+; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; TFALWAYS-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP9]], zeroinitializer
+; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
+; TFALWAYS-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true)
+; TFALWAYS-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP11]], i1 false)
+; TFALWAYS-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; TFALWAYS-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]]
+; TFALWAYS-NEXT: store double [[TMP14]], ptr [[P]], align 8
+; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP7]])
+; TFALWAYS-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; TFALWAYS-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true
+; TFALWAYS-NEXT: br i1 [[TMP16]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TFALWAYS: [[END]]:
+; TFALWAYS-NEXT: ret void
+;
+; TFFALLBACK-LABEL: define void @test_widen_exp_v2(
+; TFFALLBACK-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; TFFALLBACK-NEXT: [[ENTRY:.*]]:
+; TFFALLBACK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; TFFALLBACK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]]
+; TFFALLBACK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]]
+; TFFALLBACK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0
+; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
+; TFFALLBACK-NEXT: br label %[[VECTOR_BODY:.*]]
+; TFFALLBACK: [[VECTOR_BODY]]:
+; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TFFALLBACK-NEXT: [[TMP8:%.*]] = load double, ptr [[P2]], align 8
+; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP8]], i64 0
+; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; TFFALLBACK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP9]], zeroinitializer
+; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
+; TFFALLBACK-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true)
+; TFFALLBACK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP11]], i1 false)
+; TFFALLBACK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; TFFALLBACK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]]
+; TFFALLBACK-NEXT: store double [[TMP14]], ptr [[P]], align 8
+; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP7]])
+; TFFALLBACK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; TFFALLBACK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true
+; TFFALLBACK-NEXT: br i1 [[TMP16]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TFFALLBACK: [[END]]:
+; TFFALLBACK-NEXT: ret void
;
; TFA_INTERLEAVE-LABEL: define void @test_widen_exp_v2(
; TFA_INTERLEAVE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]:
; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2
-; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP7]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[TMP10]]
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], [[TMP10]]
; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[TMP0]]
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]]
+; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[TMP0]])
; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]]
; TFA_INTERLEAVE: [[VECTOR_BODY]]:
-; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP9:.*]] ]
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP9]] ]
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP9]] ]
+; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT3:%.*]], %[[VECTOR_BODY]] ]
; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
-; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
-; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
-; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00
-; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = or i1 [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK2]]
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP14]], label %[[BB8:.*]], label %[[TMP9]]
-; TFA_INTERLEAVE: [[BB8]]:
+; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP4]], i64 0
+; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]])
+; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP11]], zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP13]], zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
+; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
+; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2
+; TFA_INTERLEAVE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP16]], i1 false)
+; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], [[FIRST_INACTIVE_LANE]]
+; TFA_INTERLEAVE-NEXT: [[FIRST_INACTIVE_LANE4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP15]], i1 false)
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = icmp ne i64 [[FIRST_INACTIVE_LANE4]], [[TMP29]]
+; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = select i1 [[TMP20]], i64 [[FIRST_INACTIVE_LANE4]], i64 [[TMP30]]
+; TFA_INTERLEAVE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP31]], 1
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]]
+; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], [[TMP23]]
+; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI4]], i64 [[TMP25]]
+; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp uge i64 [[LAST_ACTIVE_LANE]], [[TMP23]]
+; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP27]], double [[TMP26]], double [[TMP24]]
; TFA_INTERLEAVE-NEXT: store double [[PREDPHI3]], ptr [[P]], align 8
-; TFA_INTERLEAVE-NEXT: br label %[[TMP9]]
-; TFA_INTERLEAVE: [[TMP9]]:
-; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP20]], [[TMP3]]
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT1]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT3]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP19]], i64 [[TMP3]])
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT1]], i32 0
; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true
; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TFA_INTERLEAVE: [[END]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index b9dfdaf9125ce..c4894f5ddc346 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -396,15 +396,20 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true)
+; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP14]], i1 false)
+; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP15]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], i64 [[LAST_ACTIVE_LANE]]
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll
index be59bc1769eac..fb56cec217d88 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll
@@ -10,10 +10,6 @@ define i32 @widenpointerinduction_evl_cse(ptr noalias %p0, ptr noalias %p1) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[P0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[P1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P0]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
@@ -24,8 +20,18 @@ define i32 @widenpointerinduction_evl_cse(ptr noalias %p0, ptr noalias %p1) {
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI3]], <vscale x 4 x i32> [[TMP1]]
; CHECK-NEXT: [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i32> [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> [[VECTOR_GEP4]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT2]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP13]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP4]], i64 [[TMP14]]
+; CHECK-NEXT: store ptr [[TMP8]], ptr [[P0]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP]], i64 [[TMP14]]
+; CHECK-NEXT: store ptr [[TMP12]], ptr [[P1]], align 4
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 1
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index fabab210fb850..b0dcbb119bd63 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -31,17 +31,21 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[COND:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT1]], splat (i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[BOXES]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[NBRBOXES]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[BOXES]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[TMP12]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP15]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[PREDPHI]], i64 [[TMP18]]
+; CHECK-NEXT: store i32 [[TMP20]], ptr [[NBRBOXES]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP10]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -160,39 +164,55 @@ exit:
}
define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2, ptr %p3, i64 %N) {
-; CHECK-LABEL: @store_to_addr_generated_from_invariant_addr(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P0:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP4]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP8]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; RVA23-LABEL: @store_to_addr_generated_from_invariant_addr(
+; RVA23-NEXT: entry:
+; RVA23-NEXT: br label [[LOOP:%.*]]
+; RVA23: loop:
+; RVA23-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; RVA23-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P1:%.*]], i64 [[IV]]
+; RVA23-NEXT: store ptr [[P0:%.*]], ptr [[ARRAYIDX11]], align 8
+; RVA23-NEXT: [[TMP0:%.*]] = load i64, ptr [[P2:%.*]], align 4
+; RVA23-NEXT: [[BITS_TO_GO:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP0]]
+; RVA23-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
+; RVA23-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
+; RVA23-NEXT: store i8 0, ptr [[BITS_TO_GO]], align 1
+; RVA23-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; RVA23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N:%.*]]
+; RVA23-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
+; RVA23: exit:
+; RVA23-NEXT: ret void
+;
+; RVA23ZVL1024B-LABEL: @store_to_addr_generated_from_invariant_addr(
+; RVA23ZVL1024B-NEXT: entry:
+; RVA23ZVL1024B-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; RVA23ZVL1024B-NEXT: br label [[VECTOR_PH:%.*]]
+; RVA23ZVL1024B: vector.ph:
+; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P0:%.*]], i64 0
+; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; RVA23ZVL1024B-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; RVA23ZVL1024B-NEXT: br label [[VECTOR_BODY:%.*]]
+; RVA23ZVL1024B: vector.body:
+; RVA23ZVL1024B-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RVA23ZVL1024B-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RVA23ZVL1024B-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; RVA23ZVL1024B-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP3]], i64 0
+; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; RVA23ZVL1024B-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; RVA23ZVL1024B-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> align 8 [[TMP4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP2]])
+; RVA23ZVL1024B-NEXT: [[TMP5:%.*]] = load i64, ptr [[P2:%.*]], align 4
+; RVA23ZVL1024B-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP5]]
+; RVA23ZVL1024B-NEXT: store i32 0, ptr [[TMP6]], align 4
+; RVA23ZVL1024B-NEXT: store i32 0, ptr [[TMP6]], align 4
+; RVA23ZVL1024B-NEXT: store i8 0, ptr [[TMP6]], align 1
+; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
+; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RVA23ZVL1024B: middle.block:
+; RVA23ZVL1024B-NEXT: br label [[EXIT:%.*]]
+; RVA23ZVL1024B: exit:
+; RVA23ZVL1024B-NEXT: ret void
;
entry:
br label %loop
@@ -250,7 +270,7 @@ define i8 @mixed_gather_scatters(ptr %A, ptr %B, ptr %C) #0 {
; RVA23-NEXT: [[TMP14]] = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> [[TMP13]], <vscale x 2 x i8> [[VEC_PHI]], i32 [[TMP0]])
; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
; RVA23-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; RVA23: middle.block:
; RVA23-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> [[TMP14]])
; RVA23-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 66a7493b067c8..9cff9a8c3a9a8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -126,8 +126,6 @@ define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
; CHECK-NEXT: [[TMP2:%.*]] = mul <vscale x 2 x i32> [[TMP1]], splat (i32 2)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> splat (i32 1), [[TMP2]]
@@ -143,8 +141,13 @@ define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
; CHECK-NEXT: [[TMP5:%.*]] = or <vscale x 2 x i32> [[VEC_IND1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP6:%.*]] = sext <vscale x 2 x i32> [[TMP5]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 2 x i64> [[TMP6]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[TMP7]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP8]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x ptr> [[TMP7]], i64 [[TMP14]]
+; CHECK-NEXT: store ptr [[TMP13]], ptr [[DST]], align 8
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <vscale x 2 x i32> [[VEC_IND1]], [[BROADCAST_SPLAT3]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll
index 97bf31d77e00e..06b00b3f90df8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll
@@ -9,8 +9,6 @@ define i32 @widenpointerinduction_evl(ptr noalias %p) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[P]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
@@ -18,7 +16,13 @@ define i32 @widenpointerinduction_evl(ptr noalias %p) {
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i32> [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP8]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP]], i64 [[TMP3]]
+; CHECK-NEXT: store ptr [[TMP7]], ptr [[P]], align 4
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
index 786ef735fc7ad..0571a3feae6e0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
@@ -13,10 +13,6 @@ define void @ptr_induction(ptr %p, ptr noalias %q, ptr noalias %p.end) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P2]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[Q]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -29,8 +25,17 @@ define void @ptr_induction(ptr %p, ptr noalias %q, ptr noalias %p.end) #0 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint <vscale x 2 x ptr> [[VECTOR_GEP]] to <vscale x 2 x i64>
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]])
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]])
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x i64> [[TMP6]], i64 [[TMP11]]
+; CHECK-NEXT: store i64 [[TMP10]], ptr [[Q]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IND]], i64 [[TMP11]]
+; CHECK-NEXT: store i64 [[TMP14]], ptr [[P]], align 8
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[PTR_IND7]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP5]]
@@ -85,8 +90,6 @@ define i1 @scalarize_ptr_induction(ptr %start, ptr %end, ptr noalias %dst, i1 %c
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[END]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -102,17 +105,24 @@ define i1 @scalarize_ptr_induction(ptr %start, ptr %end, ptr noalias %dst, i1 %c
; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 2 x i32> [[TMP18]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP19]], splat (i64 -7070675565921424023)
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 2 x i64> [[TMP20]], splat (i64 -4)
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP21]], <vscale x 2 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP26]], 1
+; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP21]], i64 [[TMP29]]
+; CHECK-NEXT: store i64 [[TMP33]], ptr [[DST]], align 1, !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 12
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 2 x ptr> [[TMP16]], [[BROADCAST_SPLAT7]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]]
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 12, [[TMP26]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP27]]
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr nusw i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 12
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 2 x ptr> [[TMP30]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP26]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <vscale x 2 x i1> [[TMP17]], i64 [[TMP29]]
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
index c35a3d7b9269f..470e570db4969 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
@@ -6,29 +6,47 @@
define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
; CHECK-LABEL: define void @pr154103(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X]] to i64
-; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 0, [[CONV]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[DIV]], 0
-; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[LATCH]]
-; CHECK: [[THEN]]:
-; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[Y]] to i64
-; CHECK-NEXT: [[NOT:%.*]] = xor i64 [[ZEXT]], 0
-; CHECK-NEXT: br label %[[LATCH]]
-; CHECK: [[LATCH]]:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[NOT]], %[[THEN]] ], [ 0, %[[LOOP]] ]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[COND]] to i16
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 4 x i64> [[TMP0]], splat (i64 7)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 1), [[TMP1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[IV:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 7, [[TMP3]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[TMP5]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> [[TMP6]], <vscale x 4 x i64> splat (i64 1), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP9]], i32 [[TMP2]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER3]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <vscale x 4 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[TMP11]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = trunc <vscale x 4 x i64> [[PREDPHI]] to <vscale x 4 x i16>
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 0
+; CHECK-NEXT: [[TRUNC:%.*]] = extractelement <vscale x 4 x i16> [[TMP12]], i64 [[TMP13]]
; CHECK-NEXT: store i16 [[TRUNC]], ptr [[C]], align 2
; CHECK-NEXT: store i32 0, ptr [[D]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 7
+; CHECK-NEXT: [[IV]] = sub nuw i64 [[AVL]], [[TMP3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[DONE]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index d1c87eec16189..fa3ba7a8bdf1d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -7,31 +7,35 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[A]], 48
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <vscale x 8 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <vscale x 8 x i64> [[TMP1]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <vscale x 8 x i8> [[BROADCAST_SPLAT]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 9, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_COND]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 2 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i32> [[TMP8]], <vscale x 2 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP16:%.*]] = shl <vscale x 2 x i32> [[PREDPHI]], splat (i32 8)
-; CHECK-NEXT: [[TMP17:%.*]] = trunc <vscale x 2 x i32> [[TMP16]] to <vscale x 2 x i8>
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> [[TMP17]], <vscale x 2 x ptr> align 1 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP11]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i32> [[TMP3]], <vscale x 8 x i32> [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = shl <vscale x 8 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP12]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP16]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 8 x i8> [[TMP9]], i64 [[TMP15]]
+; CHECK-NEXT: store i8 [[TMP13]], ptr [[P]], align 1
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP11]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT8]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index a1b8cbbabeece..5bd03b43b837a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -8,20 +8,20 @@
define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) {
; CHECK-LABEL: define void @truncate_to_minimal_bitwidths_widen_cast_recipe(
; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 9, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x ptr> align 1 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[MUL16:%.*]] = mul i32 0, [[CONV]]
+; CHECK-NEXT: [[SHR35:%.*]] = lshr i32 [[MUL16]], 1
+; CHECK-NEXT: [[CONV36:%.*]] = trunc i32 [[SHR35]] to i8
+; CHECK-NEXT: store i8 [[CONV36]], ptr null, align 1
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 8
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -75,7 +75,7 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: store i8 [[TMP10]], ptr [[DST]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4294967296, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -95,7 +95,7 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[COUNT_NEXT]] = add i32 [[COUNT]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[COUNT_NEXT]], 0
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -124,6 +124,5 @@ exit: ; preds = %loop
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index df848f2db917f..da65846c75e16 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -20,8 +20,6 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x ptr> poison, ptr [[DSTV]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 8 x ptr> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -34,9 +32,14 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 8 x i32> [[TMP15]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i32> [[TMP23]], <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 8 x i32> [[TMP17]] to <vscale x 8 x i8>
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP24]], <vscale x 8 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP20]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP19]], 8
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP12]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 8 x i8> [[TMP24]], i64 [[TMP10]]
+; CHECK-NEXT: store i8 [[TMP14]], ptr [[DSTV]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: store i16 0, ptr null, align 2
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index adda7c362b8e8..536e41650ed94 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -549,8 +549,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[ENTRY:.*:]]
; SCALABLE-NEXT: br label %[[VECTOR_PH:.*]]
; SCALABLE: [[VECTOR_PH]]:
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
@@ -563,7 +561,12 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP8]], 1
+; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP5]], 2
+; SCALABLE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP11]], 0
+; SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IND]], i64 [[TMP4]]
+; SCALABLE-NEXT: store i64 [[TMP12]], ptr [[B]], align 8
; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[TMP10]]
@@ -618,8 +621,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[ENTRY:.*:]]
; TF-SCALABLE-NEXT: br label %[[VECTOR_PH:.*]]
; TF-SCALABLE: [[VECTOR_PH]]:
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
@@ -632,7 +633,12 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP13]], 1
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP8]], 2
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP6]], 0
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IND]], i64 [[TMP4]]
+; TF-SCALABLE-NEXT: store i64 [[TMP7]], ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
@@ -896,6 +902,199 @@ for.end:
ret void
}
+define void @uniform_load_store(ptr %p, ptr %q, i32 %n) {
+; SCALABLE-LABEL: define void @uniform_load_store(
+; SCALABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; SCALABLE-NEXT: [[ENTRY:.*:]]
+; SCALABLE-NEXT: br label %[[VECTOR_SCEVCHECK:.*]]
+; SCALABLE: [[VECTOR_SCEVCHECK]]:
+; SCALABLE-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; SCALABLE-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
+; SCALABLE-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; SCALABLE: [[VECTOR_MEMCHECK]]:
+; SCALABLE-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Q]], i64 4
+; SCALABLE-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
+; SCALABLE-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; SCALABLE-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; SCALABLE-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4
+; SCALABLE-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]]
+; SCALABLE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[Q]], [[SCEVGEP1]]
+; SCALABLE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[P]], [[SCEVGEP]]
+; SCALABLE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; SCALABLE-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; SCALABLE: [[VECTOR_PH]]:
+; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SCALABLE: [[VECTOR_BODY]]:
+; SCALABLE-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
+; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P]], i32 [[EVL_BASED_IV]]
+; SCALABLE-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]), !alias.scope [[META10:![0-9]+]]
+; SCALABLE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 1)
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]), !alias.scope [[META10]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
+; SCALABLE-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1
+; SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; SCALABLE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 0
+; SCALABLE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[TMP8]], i64 [[TMP10]]
+; SCALABLE-NEXT: store i32 [[TMP14]], ptr [[Q]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META10]]
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
+; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; SCALABLE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; SCALABLE: [[MIDDLE_BLOCK]]:
+; SCALABLE-NEXT: br label %[[EXIT:.*]]
+; SCALABLE: [[SCALAR_PH]]:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; SCALABLE-NEXT: br label %[[LOOP:.*]]
+; SCALABLE: [[LOOP]]:
+; SCALABLE-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SCALABLE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; SCALABLE-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4
+; SCALABLE-NEXT: [[Y:%.*]] = add i32 [[X]], 1
+; SCALABLE-NEXT: store i32 [[Y]], ptr [[GEP]], align 4
+; SCALABLE-NEXT: store i32 [[Y]], ptr [[Q]], align 4
+; SCALABLE-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; SCALABLE-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; SCALABLE: [[EXIT]]:
+; SCALABLE-NEXT: ret void
+;
+; FIXEDLEN-LABEL: define void @uniform_load_store(
+; FIXEDLEN-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; FIXEDLEN-NEXT: [[ENTRY:.*]]:
+; FIXEDLEN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 17
+; FIXEDLEN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; FIXEDLEN: [[VECTOR_SCEVCHECK]]:
+; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
+; FIXEDLEN-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; FIXEDLEN: [[VECTOR_MEMCHECK]]:
+; FIXEDLEN-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Q]], i64 4
+; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
+; FIXEDLEN-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; FIXEDLEN-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; FIXEDLEN-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4
+; FIXEDLEN-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]]
+; FIXEDLEN-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[Q]], [[SCEVGEP1]]
+; FIXEDLEN-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[P]], [[SCEVGEP]]
+; FIXEDLEN-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; FIXEDLEN-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; FIXEDLEN: [[VECTOR_PH]]:
+; FIXEDLEN-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16
+; FIXEDLEN-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
+; FIXEDLEN: [[VECTOR_BODY]]:
+; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i32 [[INDEX]]
+; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 8
+; FIXEDLEN-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META18:![0-9]+]]
+; FIXEDLEN-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META18]]
+; FIXEDLEN-NEXT: [[TMP8:%.*]] = add <8 x i32> [[WIDE_LOAD]], splat (i32 1)
+; FIXEDLEN-NEXT: [[TMP9:%.*]] = add <8 x i32> [[WIDE_LOAD2]], splat (i32 1)
+; FIXEDLEN-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP6]], align 4, !alias.scope [[META18]]
+; FIXEDLEN-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP7]], align 4, !alias.scope [[META18]]
+; FIXEDLEN-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
+; FIXEDLEN-NEXT: store i32 [[TMP10]], ptr [[Q]], align 4, !alias.scope [[META21:![0-9]+]], !noalias [[META18]]
+; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; FIXEDLEN-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; FIXEDLEN-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; FIXEDLEN: [[MIDDLE_BLOCK]]:
+; FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; FIXEDLEN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN: [[SCALAR_PH]]:
+; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; FIXEDLEN-NEXT: br label %[[LOOP:.*]]
+; FIXEDLEN: [[LOOP]]:
+; FIXEDLEN-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; FIXEDLEN-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; FIXEDLEN-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4
+; FIXEDLEN-NEXT: [[Y:%.*]] = add i32 [[X]], 1
+; FIXEDLEN-NEXT: store i32 [[Y]], ptr [[GEP]], align 4
+; FIXEDLEN-NEXT: store i32 [[Y]], ptr [[Q]], align 4
+; FIXEDLEN-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; FIXEDLEN-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; FIXEDLEN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
+; FIXEDLEN: [[EXIT]]:
+; FIXEDLEN-NEXT: ret void
+;
+; TF-SCALABLE-LABEL: define void @uniform_load_store(
+; TF-SCALABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; TF-SCALABLE-NEXT: [[ENTRY:.*:]]
+; TF-SCALABLE-NEXT: br label %[[VECTOR_SCEVCHECK:.*]]
+; TF-SCALABLE: [[VECTOR_SCEVCHECK]]:
+; TF-SCALABLE-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; TF-SCALABLE-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
+; TF-SCALABLE-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; TF-SCALABLE: [[VECTOR_MEMCHECK]]:
+; TF-SCALABLE-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Q]], i64 4
+; TF-SCALABLE-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
+; TF-SCALABLE-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; TF-SCALABLE-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4
+; TF-SCALABLE-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]]
+; TF-SCALABLE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[Q]], [[SCEVGEP1]]
+; TF-SCALABLE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[P]], [[SCEVGEP]]
+; TF-SCALABLE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; TF-SCALABLE-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; TF-SCALABLE: [[VECTOR_PH]]:
+; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; TF-SCALABLE: [[VECTOR_BODY]]:
+; TF-SCALABLE-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P]], i32 [[EVL_BASED_IV]]
+; TF-SCALABLE-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]), !alias.scope [[META10:![0-9]+]]
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 1)
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]), !alias.scope [[META10]]
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; TF-SCALABLE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 0
+; TF-SCALABLE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[TMP8]], i64 [[TMP10]]
+; TF-SCALABLE-NEXT: store i32 [[TMP14]], ptr [[Q]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META10]]
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
+; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
+; TF-SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; TF-SCALABLE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; TF-SCALABLE: [[MIDDLE_BLOCK]]:
+; TF-SCALABLE-NEXT: br label %[[EXIT:.*]]
+; TF-SCALABLE: [[SCALAR_PH]]:
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; TF-SCALABLE-NEXT: br label %[[LOOP:.*]]
+; TF-SCALABLE: [[LOOP]]:
+; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; TF-SCALABLE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; TF-SCALABLE-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4
+; TF-SCALABLE-NEXT: [[Y:%.*]] = add i32 [[X]], 1
+; TF-SCALABLE-NEXT: store i32 [[Y]], ptr [[GEP]], align 4
+; TF-SCALABLE-NEXT: store i32 [[Y]], ptr [[Q]], align 4
+; TF-SCALABLE-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; TF-SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; TF-SCALABLE-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; TF-SCALABLE: [[EXIT]]:
+; TF-SCALABLE-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %entry], [%iv.next, %loop]
+ %gep = getelementptr i32, ptr %p, i32 %iv
+ %x = load i32, ptr %gep
+ %y = add i32 %x, 1
+ store i32 %y, ptr %gep
+ store i32 %y, ptr %q
+ %iv.next = add i32 %iv, 1
+ %done = icmp eq i32 %iv.next, %n
+ br i1 %done, label %exit, label %loop
+
+exit:
+ ret void
+}
+
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index dca4f47738309..7737214003080 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -7,41 +7,15 @@ target triple = "riscv64-unknown-unknown-elf"
define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
-; CHECK: [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT3]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
-; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT: store i32 [[DOTPRE]], ptr [[DST]], align 4
; CHECK-NEXT: [[TMP3]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[TMP3]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -59,14 +33,3 @@ loop:
exit:
ret void
}
-;.
-; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
-; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
-; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
-; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
-; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
-; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
-;.
>From 0305e5c981a0c0b2aef202b906349a67c25157aa Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Tue, 20 Jan 2026 17:31:41 -0800
Subject: [PATCH 2/4] Use cost model based decision.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 16 ++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 109 +++++++++++++++++-
.../Transforms/Vectorize/VPlanTransforms.h | 5 +
.../LoopVectorize/AArch64/sve-tail-folding.ll | 3 -
.../Transforms/LoopVectorize/RISCV/cse.ll | 6 -
.../Transforms/LoopVectorize/RISCV/divrem.ll | 7 +-
.../RISCV/gather-scatter-cost.ll | 41 ++++---
.../LoopVectorize/RISCV/induction-costs.ll | 3 -
.../RISCV/pointer-induction-rv32.ll | 3 -
.../LoopVectorize/RISCV/pointer-induction.ll | 16 +--
.../LoopVectorize/RISCV/pr154103.ll | 52 +++------
.../Transforms/LoopVectorize/RISCV/pr88802.ll | 3 -
.../truncate-to-minimal-bitwidth-evl-crash.ll | 31 ++---
.../RISCV/type-info-cache-evl-crash.ll | 3 -
.../LoopVectorize/RISCV/uniform-load-store.ll | 12 --
.../vf-will-not-generate-any-vector-insts.ll | 43 ++++++-
.../LoopVectorize/vplan-print-after-all.ll | 1 +
17 files changed, 231 insertions(+), 123 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 51895d1be27d7..ebee1643fc623 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8127,18 +8127,30 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VFRange SubRange = {VF, MaxVFTimes2};
if (auto Plan = tryToBuildVPlanWithVPRecipes(
std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+ // Add the start VF to prevent optimizations on scalar VF.
+ Plan->addVF(SubRange.Start);
+ Plan->setName("Initial VPlan");
// Now optimize the initial VPlan.
VPlanTransforms::hoistPredicatedLoads(*Plan, PSE, OrigLoop);
VPlanTransforms::sinkPredicatedStores(*Plan, PSE, OrigLoop);
RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
CM.getMinimalBitwidths());
RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
+ OrigLoop);
+ RUN_VPLAN_PASS(VPlanTransforms::narrowScatters, *Plan, CostCtx, SubRange,
+ CM.foldTailWithEVL());
+
// TODO: try to put addExplicitVectorLength close to addActiveLaneMask
if (CM.foldTailWithEVL()) {
RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
CM.getMaxSafeElements());
RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
}
+
+ for (ElementCount VF : drop_begin(SubRange))
+ Plan->addVF(VF);
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -8365,10 +8377,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
Range);
}
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
-
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c6e65c8f4e6ad..a4189afcfc6a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1602,6 +1602,112 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
}
}
+void VPlanTransforms::narrowScatters(VPlan &Plan, VPCostContext &Ctx,
+ VFRange &Range,
+ const bool &FoldTailWithEVL) {
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+ if (!isa<VPWidenStoreRecipe>(&R))
+ continue;
+      // Convert an unmasked or header-masked scatter with a uniform address
+      // into extract-last-lane + scalar store.
+ // TODO: Add a profitability check comparing the cost of a scatter vs.
+ // extract + scalar store.
+ auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
+ if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
+ !WidenStoreR->isConsecutive()) {
+ assert(!WidenStoreR->isReverse() &&
+ "Not consecutive memory recipes shouldn't be reversed");
+ VPValue *Mask = WidenStoreR->getMask();
+
+        // Convert the scatter to a scalar store if it is unmasked or
+        // header-masked.
+ if (Mask && !vputils::isHeaderMask(Mask, Plan))
+ continue;
+
+ VPInstruction *Extract;
+ if (!Mask) {
+ Extract = new VPInstruction(VPInstruction::ExtractLastLane,
+ {WidenStoreR->getOperand(1)});
+ } else {
+ // If the mask is the header mask, this mask contains at least one
+ // active lane. So it is safe to convert the scatter to a scalar
+ // store.
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ InstructionCost ScatterCost =
+ WidenStoreR->computeCost(VF, Ctx);
+ // ConvertToScalarCost = LastActiveLane + ExtractElement +
+ // scalar store.
+ // TODO: Automatically sync with VPInstruction::computeCost.
+ // LastActiveLane = not + cttz.elts + sub
+ InstructionCost ScalarCost = 0;
+ auto *ValTy = Ctx.Types.inferScalarType(
+ WidenStoreR->getStoredValue());
+
+ if (!FoldTailWithEVL) {
+ auto *PredTy =
+ toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::experimental_cttz_elts,
+ Type::getInt64Ty(Ctx.LLVMCtx),
+ {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
+ ScalarCost +=
+ Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+ ScalarCost += Ctx.TTI.getArithmeticInstrCost(
+ Instruction::Xor, PredTy, Ctx.CostKind,
+ {TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None});
+ }
+ // ExtractElement cost
+ auto *VecTy = toVectorTy(ValTy, VF);
+ ScalarCost += Ctx.TTI.getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Ctx.CostKind);
+
+ ScalarCost += Ctx.TTI.getArithmeticInstrCost(
+ Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx),
+ Ctx.CostKind);
+
+ // Scalar store cost
+ Instruction &I = WidenStoreR->getIngredient();
+ unsigned AS = getLoadStoreAddressSpace(&I);
+ TTI::OperandValueInfo OpInfo =
+ TTI::getOperandInfo(I.getOperand(0));
+ ScalarCost += Ctx.TTI.getMemoryOpCost(
+ Instruction::Store, ValTy, WidenStoreR->getAlign(), AS,
+ Ctx.CostKind, OpInfo, &I);
+
+ return ScalarCost.isValid() && ScalarCost <= ScatterCost;
+ },
+ Range))
+ continue;
+
+ VPInstruction *Idx =
+ new VPInstruction(VPInstruction::LastActiveLane, Mask);
+ Idx->insertBefore(WidenStoreR);
+ Extract = new VPInstruction(VPInstruction::ExtractLane,
+ {Idx, WidenStoreR->getOperand(1)});
+ }
+ Extract->insertBefore(WidenStoreR);
+
+      // TODO: Sink the scalar store recipe to the middle block if possible.
+ auto *ScalarStore = new VPReplicateRecipe(
+ &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
+ true /*IsSingleScalar*/, nullptr /*Mask*/, {},
+ *WidenStoreR /*Metadata*/);
+ ScalarStore->insertBefore(WidenStoreR);
+ WidenStoreR->eraseFromParent();
+ }
+ }
+ }
+}
+
static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (Plan.hasScalarVFOnly())
return;
@@ -1613,8 +1719,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
- VPWidenStoreRecipe>(&R))
+ if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe>(&R))
continue;
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 49d15f15ece94..d0c23b812e954 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -460,6 +460,11 @@ struct VPlanTransforms {
/// are only valid for a subset of VFs in Range, Range.End is updated.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
VFRange &Range);
+
+ /// Convert a scatter with a uniform address into extract-last-active-lane +
+ /// scalar store when profitable.
+ static void narrowScatters(VPlan &Plan, VPCostContext &Ctx, VFRange &Range,
+ const bool &FoldTailWithEVL);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index c4894f5ddc346..fba4ef7274769 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -405,9 +405,6 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true)
; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP14]], i1 false)
; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP15]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 0
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], i64 [[LAST_ACTIVE_LANE]]
; CHECK-NEXT: store i32 [[TMP16]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll
index fb56cec217d88..2c55ba6e4a8dd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/cse.ll
@@ -22,14 +22,8 @@ define i32 @widenpointerinduction_evl_cse(ptr noalias %p0, ptr noalias %p1) {
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP4]], i64 [[TMP14]]
; CHECK-NEXT: store ptr [[TMP8]], ptr [[P0]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP]], i64 [[TMP14]]
; CHECK-NEXT: store ptr [[TMP12]], ptr [[P1]], align 4
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 9feaa4edad29c..e68c705258660 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -650,8 +650,6 @@ define void @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c, ptr %p) {
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[P:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i8> @llvm.stepvector.nxv4i8()
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i8> splat (i8 -12), [[TMP1]]
; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
@@ -669,7 +667,10 @@ define void @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c, ptr %p) {
; CHECK-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i16> [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[TMP8]] to <vscale x 4 x i32>
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP9]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP13]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i32> [[PREDPHI]], i64 [[TMP11]]
+; CHECK-NEXT: store i32 [[TMP12]], ptr [[P:%.*]], align 4
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i8> [[VEC_IND]], [[BROADCAST_SPLAT8]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index b0dcbb119bd63..aaf861259086f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -41,9 +41,6 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[TMP12]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[PREDPHI]], i64 [[TMP18]]
; CHECK-NEXT: store i32 [[TMP20]], ptr [[NBRBOXES]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP10]]
@@ -166,19 +163,35 @@ exit:
define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2, ptr %p3, i64 %N) {
; RVA23-LABEL: @store_to_addr_generated_from_invariant_addr(
; RVA23-NEXT: entry:
+; RVA23-NEXT: [[TMP5:%.*]] = add i64 [[N:%.*]], 1
; RVA23-NEXT: br label [[LOOP:%.*]]
-; RVA23: loop:
-; RVA23-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; RVA23-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P1:%.*]], i64 [[IV]]
-; RVA23-NEXT: store ptr [[P0:%.*]], ptr [[ARRAYIDX11]], align 8
+; RVA23: vector.ph:
+; RVA23-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[P0:%.*]], i64 0
+; RVA23-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; RVA23-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; RVA23-NEXT: br label [[VECTOR_BODY:%.*]]
+; RVA23: vector.body:
+; RVA23-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP1]], [[LOOP]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RVA23-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP5]], [[LOOP]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; RVA23-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
+; RVA23-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; RVA23-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP3]], i64 0
+; RVA23-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; RVA23-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 1 x i64> [[VEC_IND]]
+; RVA23-NEXT: call void @llvm.vp.scatter.nxv1p0.nxv1p0(<vscale x 1 x ptr> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> align 8 [[TMP4]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP2]])
; RVA23-NEXT: [[TMP0:%.*]] = load i64, ptr [[P2:%.*]], align 4
; RVA23-NEXT: [[BITS_TO_GO:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP0]]
-; RVA23-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
-; RVA23-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
-; RVA23-NEXT: store i8 0, ptr [[BITS_TO_GO]], align 1
-; RVA23-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; RVA23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N:%.*]]
-; RVA23-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
+; RVA23-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[BITS_TO_GO]], i64 0
+; RVA23-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; RVA23-NEXT: call void @llvm.vp.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP2]])
+; RVA23-NEXT: call void @llvm.vp.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP2]])
+; RVA23-NEXT: call void @llvm.vp.scatter.nxv1i8.nxv1p0(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x ptr> align 1 [[BROADCAST_SPLAT4]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP2]])
+; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
+; RVA23-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RVA23: middle.block:
+; RVA23-NEXT: br label [[EXIT:%.*]]
; RVA23: exit:
; RVA23-NEXT: ret void
;
@@ -270,7 +283,7 @@ define i8 @mixed_gather_scatters(ptr %A, ptr %B, ptr %C) #0 {
; RVA23-NEXT: [[TMP14]] = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> [[TMP13]], <vscale x 2 x i8> [[VEC_PHI]], i32 [[TMP0]])
; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
; RVA23-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; RVA23: middle.block:
; RVA23-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> [[TMP14]])
; RVA23-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 9cff9a8c3a9a8..b0659cb2ee6dd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -143,9 +143,6 @@ define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 2 x i64> [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP8]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x ptr> [[TMP7]], i64 [[TMP14]]
; CHECK-NEXT: store ptr [[TMP13]], ptr [[DST]], align 8
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll
index 06b00b3f90df8..5440d2106ea6e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction-rv32.ll
@@ -18,9 +18,6 @@ define i32 @widenpointerinduction_evl(ptr noalias %p) {
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP8]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP]], i64 [[TMP3]]
; CHECK-NEXT: store ptr [[TMP7]], ptr [[P]], align 4
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
index 0571a3feae6e0..cab479a5f7a6f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
@@ -26,14 +26,8 @@ define void @ptr_induction(ptr %p, ptr noalias %q, ptr noalias %p.end) #0 {
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint <vscale x 2 x ptr> [[VECTOR_GEP]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP9]], 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x i64> [[TMP6]], i64 [[TMP11]]
; CHECK-NEXT: store i64 [[TMP10]], ptr [[Q]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP13]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IND]], i64 [[TMP11]]
; CHECK-NEXT: store i64 [[TMP14]], ptr [[P]], align 8
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
@@ -107,22 +101,16 @@ define i1 @scalarize_ptr_induction(ptr %start, ptr %end, ptr noalias %dst, i1 %c
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 2 x i64> [[TMP20]], splat (i64 -4)
; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP26]], 1
-; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 2
-; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 0
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP21]], i64 [[TMP29]]
; CHECK-NEXT: store i64 [[TMP33]], ptr [[DST]], align 1, !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 12
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 2 x ptr> [[TMP16]], [[BROADCAST_SPLAT7]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]]
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 12, [[TMP26]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP27]]
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 12
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 2 x ptr> [[TMP22]], [[BROADCAST_SPLAT7]]
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <vscale x 2 x i1> [[TMP17]], i64 [[TMP29]]
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
index 470e570db4969..3491e22b1ae0d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
@@ -6,47 +6,29 @@
define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
; CHECK-LABEL: define void @pr154103(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 4 x i64> [[TMP0]], splat (i64 7)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 1), [[TMP1]]
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY:.*]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X]] to i64
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 0, [[CONV]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[DIV]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[VECTOR_BODY]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[Y]] to i64
+; CHECK-NEXT: [[NOT:%.*]] = xor i64 [[ZEXT]], 0
+; CHECK-NEXT: br label %[[VECTOR_BODY]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[IV:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 7, [[TMP3]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP4]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[TMP5]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
-; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> [[TMP6]], <vscale x 4 x i64> splat (i64 1), i32 [[TMP2]])
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i64> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP9]], i32 [[TMP2]])
-; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER3]] to <vscale x 4 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = xor <vscale x 4 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[TMP11]], <vscale x 4 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = trunc <vscale x 4 x i64> [[PREDPHI]] to <vscale x 4 x i16>
-; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 0
-; CHECK-NEXT: [[TRUNC:%.*]] = extractelement <vscale x 4 x i16> [[TMP12]], i64 [[TMP13]]
+; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[NOT]], %[[THEN]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[COND]] to i16
; CHECK-NEXT: store i16 [[TRUNC]], ptr [[C]], align 2
; CHECK-NEXT: store i32 0, ptr [[D]], align 4
-; CHECK-NEXT: [[IV]] = sub nuw i64 [[AVL]], [[TMP3]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 7
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[DONE]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[VECTOR_PH]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index fa3ba7a8bdf1d..ce2e277c53cde 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -29,9 +29,6 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[TMP9:%.*]] = trunc <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i8>
; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP10]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP12]], 8
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP16]], 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 8 x i8> [[TMP9]], i64 [[TMP15]]
; CHECK-NEXT: store i8 [[TMP13]], ptr [[P]], align 1
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP11]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index 5bd03b43b837a..189a16d87c18c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -8,21 +8,21 @@
define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) {
; CHECK-LABEL: define void @truncate_to_minimal_bitwidths_widen_cast_recipe(
; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[GEP_SRC]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT: [[MUL16:%.*]] = mul i32 0, [[CONV]]
-; CHECK-NEXT: [[SHR35:%.*]] = lshr i32 [[MUL16]], 1
-; CHECK-NEXT: [[CONV36:%.*]] = trunc i32 [[SHR35]] to i8
-; CHECK-NEXT: store i8 [[CONV36]], ptr null, align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 8
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 9, %[[LOOP]] ], [ [[AVL_NEXT:%.*]], %[[EXIT]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv1i8.nxv1p0(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x ptr> align 1 zeroinitializer, <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[EXIT]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT1:.*]]
+; CHECK: [[EXIT1]]:
; CHECK-NEXT: ret void
;
entry:
@@ -75,7 +75,7 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: store i8 [[TMP10]], ptr [[DST]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4294967296, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -95,7 +95,7 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[COUNT_NEXT]] = add i32 [[COUNT]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[COUNT_NEXT]], 0
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -124,5 +124,6 @@ exit: ; preds = %loop
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index da65846c75e16..ee05d69fd09c4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -34,9 +34,6 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 8 x i32> [[TMP17]] to <vscale x 8 x i8>
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP19]], 8
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP12]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 8 x i8> [[TMP24]], i64 [[TMP10]]
; CHECK-NEXT: store i8 [[TMP14]], ptr [[DSTV]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: store i16 0, ptr null, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 536e41650ed94..c7380190d1d24 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -562,9 +562,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP8]], 1
-; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP5]], 2
-; SCALABLE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP11]], 0
; SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IND]], i64 [[TMP4]]
; SCALABLE-NEXT: store i64 [[TMP12]], ptr [[B]], align 8
; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
@@ -634,9 +631,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP13]], 1
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP8]], 2
-; TF-SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP6]], 0
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IND]], i64 [[TMP4]]
; TF-SCALABLE-NEXT: store i64 [[TMP7]], ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
@@ -934,9 +928,6 @@ define void @uniform_load_store(ptr %p, ptr %q, i32 %n) {
; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]), !alias.scope [[META10]]
; SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; SCALABLE-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1
-; SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
-; SCALABLE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 0
; SCALABLE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[TMP8]], i64 [[TMP10]]
; SCALABLE-NEXT: store i32 [[TMP14]], ptr [[Q]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META10]]
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
@@ -1050,9 +1041,6 @@ define void @uniform_load_store(ptr %p, ptr %q, i32 %n) {
; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]), !alias.scope [[META10]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1
-; TF-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
-; TF-SCALABLE-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 0
; TF-SCALABLE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[TMP8]], i64 [[TMP10]]
; TF-SCALABLE-NEXT: store i32 [[TMP14]], ptr [[Q]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META10]]
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index 7737214003080..0ecb9f53eb3fc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -7,15 +7,41 @@ target triple = "riscv64-unknown-unknown-elf"
define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true)
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> [[BROADCAST_SPLAT3]], <vscale x 1 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP5]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT: store i32 [[DOTPRE]], ptr [[DST]], align 4
; CHECK-NEXT: [[TMP3]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[TMP3]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -33,3 +59,14 @@ loop:
exit:
ret void
}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll
index 9f0a528bfa889..09c2ea6c15e79 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll
@@ -32,6 +32,7 @@
; CHECK: VPlan after mergeBlocksIntoPredecessors
; CHECK: VPlan after licm
; CHECK: VPlan after VPlanTransforms::optimize
+; CHECK: VPlan after VPlanTransforms::narrowScatters
; CHECK: VPlan after VPlanTransforms::materializeConstantVectorTripCount
; CHECK: VPlan after VPlanTransforms::unrollByUF
; CHECK: VPlan after VPlanTransforms::materializePacksAndUnpacks
>From fbdcd10c4fc5cd54fd9856433cfd2e4c0ce6d347 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Tue, 3 Feb 2026 22:19:09 -0800
Subject: [PATCH 3/4] Split off the cost calculation of LastActiveLane and
ExtractLane.
Split off the cost calculation of LastActiveLane and ExtractLane to
prevent misalignment when ::computeCost changes and to reduce
duplicated code.
---
llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 35 +++++++++++++++++++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 33 ++---------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 30 +++-------------
3 files changed, 42 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 26e4d31696f8a..62e48e0234635 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -23,6 +23,7 @@
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/Support/InstructionCost.h"
@@ -383,6 +384,40 @@ struct VPCostContext {
Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None,
bool AlwaysIncludeReplicatingR = false);
+
+ /// Compute the cost of LastActiveLane for a predicate with the given scalar
+ /// type. LastActiveLane computes the index of the last active lane in a
+ /// predicate mask: NOT + cttz_elts + SUB.
+ InstructionCost getLastActiveLaneCost(Type *PredScalarTy, ElementCount VF) {
+ if (VF.isScalar())
+ return TTI.getCmpSelInstrCost(Instruction::ICmp, PredScalarTy,
+ CmpInst::makeCmpResultType(PredScalarTy),
+ CmpInst::ICMP_EQ, CostKind);
+ auto *PredTy = VectorType::get(PredScalarTy, VF);
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
+ Type::getInt64Ty(LLVMCtx),
+ {PredTy, Type::getInt1Ty(LLVMCtx)});
+ InstructionCost Cost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
+ // Add cost of NOT operation on the predicate.
+ Cost += TTI.getArithmeticInstrCost(
+ Instruction::Xor, PredTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None});
+ // Add cost of SUB operation on the index.
+ Cost += TTI.getArithmeticInstrCost(Instruction::Sub,
+ Type::getInt64Ty(LLVMCtx), CostKind);
+ return Cost;
+ }
+
+ /// Compute the cost of ExtractLane for a vector with the given scalar element
+ /// type. ExtractLane extracts an element at a runtime-determined index.
+ InstructionCost getExtractLaneCost(Type *ValTy, ElementCount VF) {
+ if (VF.isScalar())
+ return 0;
+ auto *VecTy = VectorType::get(ValTy, VF);
+ return TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind);
+ }
};
/// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fa395e7d07531..acdf96dd1af63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1132,16 +1132,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
- if (VF.isScalar()) {
- // ExtractLane with VF=1 takes care of handling extracting across multiple
- // parts.
- return 0;
- }
-
- // Add on the cost of extracting the element.
- auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
- return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
- Ctx.CostKind);
+ Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+ return Ctx.getExtractLaneCost(ScalarTy, VF);
}
case VPInstruction::AnyOf: {
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
@@ -1163,26 +1155,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::LastActiveLane: {
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
- if (VF.isScalar())
- return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy),
- CmpInst::ICMP_EQ, Ctx.CostKind);
- // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
- auto *PredTy = toVectorTy(ScalarTy, VF);
- IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
- Type::getInt64Ty(Ctx.LLVMCtx),
- {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
- InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
- // Add cost of NOT operation on the predicate.
- Cost += Ctx.TTI.getArithmeticInstrCost(
- Instruction::Xor, PredTy, Ctx.CostKind,
- {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
- {TargetTransformInfo::OK_UniformConstantValue,
- TargetTransformInfo::OP_None});
- // Add cost of SUB operation on the index.
- Cost += Ctx.TTI.getArithmeticInstrCost(
- Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
- return Cost;
+ return Ctx.getLastActiveLaneCost(ScalarTy, VF);
}
case VPInstruction::ExtractLastActive: {
Type *ScalarTy = Ctx.Types.inferScalarType(this);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a4189afcfc6a2..65a9b96fa49cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1643,36 +1643,14 @@ void VPlanTransforms::narrowScatters(VPlan &Plan, VPCostContext &Ctx,
WidenStoreR->computeCost(VF, Ctx);
// ConvertToScalarCost = LastActiveLane + ExtractElement +
// scalar store.
- // TODO: Automatically sync with VPInstruction::computeCost.
- // LastActiveLane = not + cttz.elts + sub
InstructionCost ScalarCost = 0;
auto *ValTy = Ctx.Types.inferScalarType(
WidenStoreR->getStoredValue());
- if (!FoldTailWithEVL) {
- auto *PredTy =
- toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
- IntrinsicCostAttributes Attrs(
- Intrinsic::experimental_cttz_elts,
- Type::getInt64Ty(Ctx.LLVMCtx),
- {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
- ScalarCost +=
- Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
- ScalarCost += Ctx.TTI.getArithmeticInstrCost(
- Instruction::Xor, PredTy, Ctx.CostKind,
- {TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OP_None},
- {TargetTransformInfo::OK_UniformConstantValue,
- TargetTransformInfo::OP_None});
- }
- // ExtractElement cost
- auto *VecTy = toVectorTy(ValTy, VF);
- ScalarCost += Ctx.TTI.getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Ctx.CostKind);
-
- ScalarCost += Ctx.TTI.getArithmeticInstrCost(
- Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx),
- Ctx.CostKind);
+ if (!FoldTailWithEVL)
+ ScalarCost += Ctx.getLastActiveLaneCost(
+ Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ ScalarCost += Ctx.getExtractLaneCost(ValTy, VF);
// Scalar store cost
Instruction &I = WidenStoreR->getIngredient();
>From c9873552b587fefa0c2f918d390d184940993c90 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 9 Feb 2026 22:03:35 -0800
Subject: [PATCH 4/4] Introduce getCostForOpcodeAndTypes and address comments.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 8 +
llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 35 ----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 46 ++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 165 +++++++-----------
4 files changed, 118 insertions(+), 136 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a5f314ac188d8..95b02aa1e69b4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1080,6 +1080,14 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF,
VPCostContext &Ctx) const;
+
+ /// Compute the cost for a recipe with \p VF using \p Opcode and \p RetTy.
+ /// This function may not be as accurate as
+ /// `getCostForRecipeWithOpcode` since it only provides type-based queries.
+ static InstructionCost getCostForRecipeWithOpcodeAndTypes(unsigned Opcode,
+ Type *RetTy,
+ ElementCount VF,
+ VPCostContext &Ctx);
};
/// Helper to access the operand that contains the unroll part for this recipe
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 62e48e0234635..26e4d31696f8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -23,7 +23,6 @@
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/Support/InstructionCost.h"
@@ -384,40 +383,6 @@ struct VPCostContext {
Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None,
bool AlwaysIncludeReplicatingR = false);
-
- /// Compute the cost of LastActiveLane for a predicate with the given scalar
- /// type. LastActiveLane computes the index of the last active lane in a
- /// predicate mask: NOT + cttz_elts + SUB.
- InstructionCost getLastActiveLaneCost(Type *PredScalarTy, ElementCount VF) {
- if (VF.isScalar())
- return TTI.getCmpSelInstrCost(Instruction::ICmp, PredScalarTy,
- CmpInst::makeCmpResultType(PredScalarTy),
- CmpInst::ICMP_EQ, CostKind);
- auto *PredTy = VectorType::get(PredScalarTy, VF);
- IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
- Type::getInt64Ty(LLVMCtx),
- {PredTy, Type::getInt1Ty(LLVMCtx)});
- InstructionCost Cost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
- // Add cost of NOT operation on the predicate.
- Cost += TTI.getArithmeticInstrCost(
- Instruction::Xor, PredTy, CostKind,
- {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
- {TargetTransformInfo::OK_UniformConstantValue,
- TargetTransformInfo::OP_None});
- // Add cost of SUB operation on the index.
- Cost += TTI.getArithmeticInstrCost(Instruction::Sub,
- Type::getInt64Ty(LLVMCtx), CostKind);
- return Cost;
- }
-
- /// Compute the cost of ExtractLane for a vector with the given scalar element
- /// type. ExtractLane extracts an element at a runtime-determined index.
- InstructionCost getExtractLaneCost(Type *ValTy, ElementCount VF) {
- if (VF.isScalar())
- return 0;
- auto *VecTy = VectorType::get(ValTy, VF);
- return TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind);
- }
};
/// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index acdf96dd1af63..bffe2ff1bcf65 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -903,6 +903,47 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
}
+InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcodeAndTypes(
+ unsigned Opcode, Type *RetTy, ElementCount VF, VPCostContext &Ctx) {
+ switch (Opcode) {
+ case VPInstruction::LastActiveLane: {
+ // LastActiveLane computes the index of the last active lane in a
+ // predicate mask: NOT + cttz_elts + SUB.
+ if (VF.isScalar())
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, RetTy,
+ CmpInst::makeCmpResultType(RetTy),
+ CmpInst::ICMP_EQ, Ctx.CostKind);
+ auto *PredTy = toVectorTy(RetTy, VF);
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
+ Type::getInt64Ty(Ctx.LLVMCtx),
+ {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
+ InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+ // Add cost of NOT operation on the predicate.
+ Cost += Ctx.TTI.getArithmeticInstrCost(
+ Instruction::Xor, PredTy, Ctx.CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None});
+ // Add cost of SUB operation on the index.
+ Cost += Ctx.TTI.getArithmeticInstrCost(
+ Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
+ return Cost;
+ }
+ case VPInstruction::ExtractLane: {
+ // Compute the cost of ExtractLane for a vector with the given scalar
+ // element type. ExtractLane extracts an element at a runtime-determined
+ // index.
+ if (VF.isScalar())
+ return 0;
+ auto *VecTy = toVectorTy(RetTy, VF);
+ return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ Ctx.CostKind);
+ }
+ default:
+ llvm_unreachable("Unsupported opcode");
+ }
+}
+
InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
Type *ScalarTy = Ctx.Types.inferScalarType(this);
@@ -1133,7 +1174,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
- return Ctx.getExtractLaneCost(ScalarTy, VF);
+ return getCostForRecipeWithOpcodeAndTypes(getOpcode(), ScalarTy, VF, Ctx);
}
case VPInstruction::AnyOf: {
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
@@ -1155,7 +1196,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::LastActiveLane: {
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
- return Ctx.getLastActiveLaneCost(ScalarTy, VF);
+ return getCostForRecipeWithOpcodeAndTypes(VPInstruction::LastActiveLane,
+ ScalarTy, VF, Ctx);
}
case VPInstruction::ExtractLastActive: {
Type *ScalarTy = Ctx.Types.inferScalarType(this);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 65a9b96fa49cc..457ac7cf7c975 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1611,67 +1611,77 @@ void VPlanTransforms::narrowScatters(VPlan &Plan, VPCostContext &Ctx,
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (!isa<VPWidenStoreRecipe>(&R))
- continue;
// Convert an unmasked or header masked scatter with an uniform address
// into extract-last-lane + scalar store.
- // TODO: Add a profitability check comparing the cost of a scatter vs.
- // extract + scalar store.
auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
- if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
- !WidenStoreR->isConsecutive()) {
- assert(!WidenStoreR->isReverse() &&
- "Not consecutive memory recipes shouldn't be reversed");
- VPValue *Mask = WidenStoreR->getMask();
-
- // Convert the scatter to a scalar store if it is unmasked or header
- // masked.
- if (Mask && !vputils::isHeaderMask(Mask, Plan))
- continue;
+ if (!WidenStoreR || !vputils::isSingleScalar(WidenStoreR->getAddr()) ||
+ WidenStoreR->isConsecutive())
+ continue;
+ assert(!WidenStoreR->isReverse() &&
+ "Not consecutive memory recipes shouldn't be reversed");
+ VPValue *Mask = WidenStoreR->getMask();
- VPInstruction *Extract;
- if (!Mask) {
- Extract = new VPInstruction(VPInstruction::ExtractLastLane,
- {WidenStoreR->getOperand(1)});
- } else {
- // If the mask is the header mask, this mask contains at least one
- // active lane. So it is safe to convert the scatter to a scalar
- // store.
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) {
- InstructionCost ScatterCost =
- WidenStoreR->computeCost(VF, Ctx);
- // ConvertToScalarCost = LastActiveLane + ExtractElement +
- // scalar store.
- InstructionCost ScalarCost = 0;
- auto *ValTy = Ctx.Types.inferScalarType(
- WidenStoreR->getStoredValue());
-
- if (!FoldTailWithEVL)
- ScalarCost += Ctx.getLastActiveLaneCost(
- Type::getInt1Ty(Ctx.LLVMCtx), VF);
- ScalarCost += Ctx.getExtractLaneCost(ValTy, VF);
-
- // Scalar store cost
- Instruction &I = WidenStoreR->getIngredient();
- unsigned AS = getLoadStoreAddressSpace(&I);
- TTI::OperandValueInfo OpInfo =
- TTI::getOperandInfo(I.getOperand(0));
- ScalarCost += Ctx.TTI.getMemoryOpCost(
- Instruction::Store, ValTy, WidenStoreR->getAlign(), AS,
- Ctx.CostKind, OpInfo, &I);
-
- return ScalarCost.isValid() && ScalarCost <= ScatterCost;
- },
- Range))
- continue;
+ // Convert the scatter to a scalar store if it is unmasked or header
+ // masked.
+ if (Mask && !vputils::isHeaderMask(Mask, Plan))
+ continue;
- VPInstruction *Idx =
- new VPInstruction(VPInstruction::LastActiveLane, Mask);
- Idx->insertBefore(WidenStoreR);
- Extract = new VPInstruction(VPInstruction::ExtractLane,
- {Idx, WidenStoreR->getOperand(1)});
- }
+ VPInstruction *Extract;
+ if (!Mask) {
+ Extract = new VPInstruction(VPInstruction::ExtractLastLane,
+ {WidenStoreR->getOperand(1)});
+ } else {
+ // If the mask is the header mask, this mask contains at least one
+ // active lane. So it is safe to convert the scatter to a scalar
+ // store. Note that this will generate LastActiveLane, which can only be
+ // used on the header mask.
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ InstructionCost ScatterCost =
+ WidenStoreR->computeCost(VF, Ctx);
+ // ConvertToScalarCost = LastActiveLane + ExtractLane +
+ // scalar store.
+ InstructionCost ScalarCost = 0;
+ auto *ValTy =
+ Ctx.Types.inferScalarType(WidenStoreR->getStoredValue());
+
+ // LastActiveLane, which will lower to `EVL - 1`, is cheaper
+ // under EVL.
+ if (FoldTailWithEVL)
+ ScalarCost += Ctx.TTI.getArithmeticInstrCost(
+ Instruction::Sub, Type::getInt32Ty(Ctx.LLVMCtx),
+ Ctx.CostKind);
+ else
+ ScalarCost +=
+ VPRecipeWithIRFlags::getCostForRecipeWithOpcodeAndTypes(
+ VPInstruction::LastActiveLane,
+ Type::getInt1Ty(Ctx.LLVMCtx), VF, Ctx);
+
+ // ExtractLane cost.
+ ScalarCost +=
+ VPRecipeWithIRFlags::getCostForRecipeWithOpcodeAndTypes(
+ VPInstruction::ExtractLane, ValTy, VF, Ctx);
+
+ // Scalar store cost
+ Instruction &I = WidenStoreR->getIngredient();
+ unsigned AS = getLoadStoreAddressSpace(&I);
+ TTI::OperandValueInfo OpInfo =
+ TTI::getOperandInfo(I.getOperand(0));
+ ScalarCost += Ctx.TTI.getMemoryOpCost(
+ Instruction::Store, ValTy, WidenStoreR->getAlign(), AS,
+ Ctx.CostKind, OpInfo, &I);
+
+ return ScalarCost.isValid() && ScalarCost <= ScatterCost;
+ },
+ Range))
+ continue;
+
+ VPInstruction *Idx =
+ new VPInstruction(VPInstruction::LastActiveLane, Mask);
+ Idx->insertBefore(WidenStoreR);
+ Extract = new VPInstruction(VPInstruction::ExtractLane,
+ {Idx, WidenStoreR->getOperand(1)});
+ }
Extract->insertBefore(WidenStoreR);
// TODO: Sink the scalar store recipe to middle block if possible.
@@ -1681,7 +1691,6 @@ void VPlanTransforms::narrowScatters(VPlan &Plan, VPCostContext &Ctx,
*WidenStoreR /*Metadata*/);
ScalarStore->insertBefore(WidenStoreR);
WidenStoreR->eraseFromParent();
- }
}
}
}
@@ -1703,48 +1712,6 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
continue;
- // Convert an unmasked or header masked scatter with an uniform address
- // into extract-last-lane + scalar store.
- // TODO: Add a profitability check comparing the cost of a scatter vs.
- // extract + scalar store.
- auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
- if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
- !WidenStoreR->isConsecutive()) {
- assert(!WidenStoreR->isReverse() &&
- "Not consecutive memory recipes shouldn't be reversed");
- VPValue *Mask = WidenStoreR->getMask();
-
- // Convert the scatter to a scalar store if it is unmasked or header
- // masked.
- if (Mask && !vputils::isHeaderMask(Mask, Plan))
- continue;
-
- VPInstruction *Extract;
- if (!Mask) {
- Extract = new VPInstruction(VPInstruction::ExtractLastLane,
- {WidenStoreR->getOperand(1)});
- } else {
- // If the mask is the header mask, this mask contains at least one
- // active lane. So it is safe to convert the scatter to a scalar
- // store.
- VPInstruction *Idx =
- new VPInstruction(VPInstruction::LastActiveLane, Mask);
- Idx->insertBefore(WidenStoreR);
- Extract = new VPInstruction(VPInstruction::ExtractLane,
- {Idx, WidenStoreR->getOperand(1)});
- }
- Extract->insertBefore(WidenStoreR);
-
- // TODO: Sink the scalar store recipe to middle block if possible.
- auto *ScalarStore = new VPReplicateRecipe(
- &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
- true /*IsSingleScalar*/, nullptr /*Mask*/, {},
- *WidenStoreR /*Metadata*/);
- ScalarStore->insertBefore(WidenStoreR);
- WidenStoreR->eraseFromParent();
- continue;
- }
-
auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
vputils::isSingleScalar(RepR->getOperand(1))) {
More information about the llvm-commits
mailing list