[llvm] WIP: [LV] Support strided memory accesses with a stride of -1 (PR #128718)

Mel Chen via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 27 03:15:35 PST 2025


https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/128718

From 098b28f61312a0ad3a3e734c4335ee7eea4d1b54 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 00:31:28 -0800
Subject: [PATCH 1/8] Pre-commit for cm_stride

---
 .../RISCV/riscv-vector-reverse-output.ll      | 417 ++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..2b8ac0ab685ab
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; This is the C++ loop being vectorized in this file with
+;; vector.reverse
+;;  #pragma clang loop vectorize_width(4, scalable)
+;;  for (int i = N-1; i >= 0; --i)
+;;    a[i] = b[i] + 1.0;
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
+; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: | FileCheck --check-prefix=RV32 %s
+
+define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
+; RV64-LABEL: define void @vector_reverse_i64(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64:       [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64:       [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT:    [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT:    br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64:       [[VECTOR_PH]]:
+; RV64-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT:    [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64:       [[VECTOR_BODY]]:
+; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP21:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT:    [[TMP22:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP21]]
+; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT:    [[TMP25:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP27:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT:    [[TMP28:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP27]]
+; RV64-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP28]]
+; RV64-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP25]])
+; RV64-NEXT:    store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64:       [[MIDDLE_BLOCK]]:
+; RV64-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64:       [[SCALAR_PH]]:
+; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; RV64:       [[FOR_COND_CLEANUP]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV64-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT:    [[ADD9:%.*]] = add i32 [[TMP32]], 1
+; RV64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV64-NEXT:    store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+; RV32-LABEL: define void @vector_reverse_i64(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV32-NEXT:  [[ENTRY:.*:]]
+; RV32-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV32:       [[FOR_BODY_PREHEADER]]:
+; RV32-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32:       [[VECTOR_PH]]:
+; RV32-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; RV32-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT:    [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV32:       [[VECTOR_BODY]]:
+; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV32-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV32-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
+; RV32-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT:    [[TMP14:%.*]] = mul i32 0, [[TMP13]]
+; RV32-NEXT:    [[TMP15:%.*]] = sub i32 1, [[TMP13]]
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]]
+; RV32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP15]]
+; RV32-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV32-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT:    [[TMP21:%.*]] = mul i32 0, [[TMP20]]
+; RV32-NEXT:    [[TMP22:%.*]] = sub i32 1, [[TMP20]]
+; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 [[TMP21]]
+; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP22]]
+; RV32-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; RV32-NEXT:    store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32:       [[MIDDLE_BLOCK]]:
+; RV32-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV32:       [[SCALAR_PH]]:
+; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    br label %[[FOR_BODY:.*]]
+; RV32:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV32-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; RV32:       [[FOR_COND_CLEANUP]]:
+; RV32-NEXT:    ret void
+; RV32:       [[FOR_BODY]]:
+; RV32-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV32-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV32-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV32-NEXT:    [[ADD9:%.*]] = add i32 [[TMP26]], 1
+; RV32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV32-NEXT:    store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV32-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+  %i.0 = add nsw i32 %i.0.in8, -1
+  %idxprom = zext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+  %1 = load i32, ptr %arrayidx, align 4
+  %add9 = add i32 %1, 1
+  %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+  store i32 %add9, ptr %arrayidx3, align 4
+  %cmp = icmp ugt i64 %indvars.iv, 1
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
+; RV64-LABEL: define void @vector_reverse_f32(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64:       [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64:       [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT:    [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT:    br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64:       [[VECTOR_PH]]:
+; RV64-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT:    [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64:       [[VECTOR_BODY]]:
+; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP21:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT:    [[TMP22:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]]
+; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]]
+; RV64-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP24]], align 4
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-NEXT:    [[TMP25:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP27:%.*]] = mul i64 0, [[TMP14]]
+; RV64-NEXT:    [[TMP28:%.*]] = sub i64 1, [[TMP14]]
+; RV64-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP27]]
+; RV64-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP28]]
+; RV64-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP25]])
+; RV64-NEXT:    store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV64:       [[MIDDLE_BLOCK]]:
+; RV64-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64:       [[SCALAR_PH]]:
+; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; RV64:       [[FOR_COND_CLEANUP]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; RV64-NEXT:    [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT:    [[CONV1:%.*]] = fadd float [[TMP32]], 1.000000e+00
+; RV64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]]
+; RV64-NEXT:    store float [[CONV1]], ptr [[ARRAYIDX3]], align 4
+; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; RV32-LABEL: define void @vector_reverse_f32(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; RV32-NEXT:  [[ENTRY:.*:]]
+; RV32-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV32:       [[FOR_BODY_PREHEADER]]:
+; RV32-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32:       [[VECTOR_PH]]:
+; RV32-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; RV32-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT:    [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV32:       [[VECTOR_BODY]]:
+; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV32-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV32-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
+; RV32-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT:    [[TMP14:%.*]] = mul i32 0, [[TMP13]]
+; RV32-NEXT:    [[TMP15:%.*]] = sub i32 1, [[TMP13]]
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]]
+; RV32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP15]]
+; RV32-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV32-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT:    [[TMP21:%.*]] = mul i32 0, [[TMP20]]
+; RV32-NEXT:    [[TMP22:%.*]] = sub i32 1, [[TMP20]]
+; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 [[TMP21]]
+; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP22]]
+; RV32-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP18]])
+; RV32-NEXT:    store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV32:       [[MIDDLE_BLOCK]]:
+; RV32-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV32:       [[SCALAR_PH]]:
+; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    br label %[[FOR_BODY:.*]]
+; RV32:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV32-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; RV32:       [[FOR_COND_CLEANUP]]:
+; RV32-NEXT:    ret void
+; RV32:       [[FOR_BODY]]:
+; RV32-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV32-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; RV32-NEXT:    [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV32-NEXT:    [[CONV1:%.*]] = fadd float [[TMP26]], 1.000000e+00
+; RV32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]]
+; RV32-NEXT:    store float [[CONV1]], ptr [[ARRAYIDX3]], align 4
+; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV32-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+  %i.0 = add nsw i32 %i.0.in8, -1
+  %idxprom = zext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+  %1 = load float, ptr %arrayidx, align 4
+  %conv1 = fadd float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+  store float %conv1, ptr %arrayidx3, align 4
+  %cmp = icmp ugt i64 %indvars.iv, 1
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+;.
+; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; RV64: [[META1]] = !{!"llvm.loop.mustprogress"}
+; RV64: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV64: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; RV64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+;.
+; RV32: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; RV32: [[META1]] = !{!"llvm.loop.mustprogress"}
+; RV32: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV32: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV32: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META3]], [[META2]]}
+; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]}
+;.
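
The CHECK lines above pin down the current lowering for the reverse loop: a unit-stride wide load from a backwards-stepped address, followed by llvm.vector.reverse to restore loop order (and the mirrored pattern on the store side). The point of this series is to fold that pair into a single strided access with a negative byte stride; the Step 3 patch below adds the recipe support for emitting it. A minimal sketch of the two forms for the i32 case (names such as %adjusted, %b.end, and %evl are illustrative, not taken from the test):

  ; Current form: load forward from an adjusted base, then reverse in registers.
  %wide.load = load <vscale x 4 x i32>, ptr %adjusted, align 4
  %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)

  ; Intended form: one strided load with a -4 byte stride (i32 elements),
  ; walking down from the high end of the range, so no reverse is needed.
  %strided = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(
               ptr align 4 %b.end, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)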

From 8886947a54f7237b378f9c4dd1b0c155404730fd Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 05:14:10 -0800
Subject: [PATCH 2/8] UF2 test

---
 .../RISCV/riscv-vector-reverse-output.ll      | 221 +++++++++++++++++-
 1 file changed, 212 insertions(+), 9 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 2b8ac0ab685ab..a15a8342154ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -7,13 +7,17 @@
 ;;    a[i] = b[i] + 1.0;
 
 ; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
-; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: -riscv-v-vector-bits-min=128 -S < %s \
 ; RUN: | FileCheck --check-prefix=RV64 %s
 
 ; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
-; RUN: -scalable-vectorization=on -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: -riscv-v-vector-bits-min=128 -S < %s \
 ; RUN: | FileCheck --check-prefix=RV32 %s
 
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -riscv-v-vector-bits-min=128 -force-vector-interleave=2 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64-UF2 %s
+
 define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-LABEL: define void @vector_reverse_i64(
 ; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -174,6 +178,105 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
 ; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ;
+; RV64-UF2-LABEL: define void @vector_reverse_i64(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-UF2-NEXT:  [[ENTRY:.*:]]
+; RV64-UF2-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-UF2-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64-UF2:       [[FOR_BODY_PREHEADER]]:
+; RV64-UF2-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64-UF2:       [[VECTOR_SCEVCHECK]]:
+; RV64-UF2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT:    [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-UF2-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-UF2-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-UF2-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-UF2-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-UF2-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-UF2-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-UF2-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-UF2-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-UF2-NEXT:    br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64-UF2:       [[VECTOR_PH]]:
+; RV64-UF2-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; RV64-UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64-UF2:       [[VECTOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP25]], align 4
+; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-UF2-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4
+; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT:    [[TMP30:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = add <vscale x 4 x i32> [[REVERSE3]], splat (i32 1)
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP33]]
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i64 [[TMP34]]
+; RV64-UF2-NEXT:    [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]]
+; RV64-UF2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 [[TMP38]]
+; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP30]])
+; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP36]], align 4
+; RV64-UF2-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]])
+; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2:       [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2:       [[SCALAR_PH]]:
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64-UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; RV64-UF2:       [[FOR_COND_CLEANUP]]:
+; RV64-UF2-NEXT:    ret void
+; RV64-UF2:       [[FOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-UF2-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT:    [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV64-UF2-NEXT:    [[ADD9:%.*]] = add i32 [[TMP42]], 1
+; RV64-UF2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT:    store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4
+; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-UF2-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+;
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
@@ -360,6 +463,105 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
 ; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
 ;
+; RV64-UF2-LABEL: define void @vector_reverse_f32(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT:  [[ENTRY:.*:]]
+; RV64-UF2-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-UF2-NEXT:    br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64-UF2:       [[FOR_BODY_PREHEADER]]:
+; RV64-UF2-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64-UF2:       [[VECTOR_SCEVCHECK]]:
+; RV64-UF2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT:    [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-UF2-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-UF2-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-UF2-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-UF2-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-UF2-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-UF2-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-UF2-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-UF2-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-UF2-NEXT:    br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64-UF2:       [[VECTOR_PH]]:
+; RV64-UF2-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; RV64-UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
+; RV64-UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64-UF2:       [[VECTOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP25]], align 4
+; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-UF2-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4
+; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT:    [[TMP30:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = fadd <vscale x 4 x float> [[REVERSE3]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]]
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i64 0, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP33]]
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP35]], i64 [[TMP34]]
+; RV64-UF2-NEXT:    [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = sub i64 1, [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]]
+; RV64-UF2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP39]], i64 [[TMP38]]
+; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP30]])
+; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP36]], align 4
+; RV64-UF2-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP31]])
+; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV64-UF2:       [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2:       [[SCALAR_PH]]:
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64-UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; RV64-UF2:       [[FOR_COND_CLEANUP]]:
+; RV64-UF2-NEXT:    ret void
+; RV64-UF2:       [[FOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
+; RV64-UF2-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; RV64-UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT:    [[TMP42:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-UF2-NEXT:    [[CONV1:%.*]] = fadd float [[TMP42]], 1.000000e+00
+; RV64-UF2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]]
+; RV64-UF2-NEXT:    store float [[CONV1]], ptr [[ARRAYIDX3]], align 4
+; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1
+; RV64-UF2-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+;
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
@@ -391,13 +593,6 @@ for.body:                                         ; preds = %for.body.preheader,
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
-; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
 ;.
 ; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
 ; RV64: [[META1]] = !{!"llvm.loop.mustprogress"}
@@ -415,3 +610,11 @@ for.body:                                         ; preds = %for.body.preheader,
 ; RV32: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
 ; RV32: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META3]], [[META2]]}
 ;.
+; RV64-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; RV64-UF2: [[META1]] = !{!"llvm.loop.mustprogress"}
+; RV64-UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV64-UF2: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64-UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; RV64-UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; RV64-UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+;.
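
The RV64-UF2 run line added here exercises the per-part address arithmetic for reversed accesses under an interleave factor of 2: with %vf = vscale * 4, part 0 of the access is based at element offset 0 * %vf and part 1 at -1 * %vf, and each part then steps back a further %vf - 1 elements so its wide load covers the right chunk. A compressed sketch of what the GEP chains in the checks compute (illustrative names):

  ; part 0: the %vf elements ending at %base
  %p0.mid = getelementptr inbounds i32, ptr %base, i64 %mul.0.vf   ; 0 * %vf
  %p0     = getelementptr inbounds i32, ptr %p0.mid, i64 %sub.1.vf ; 1 - %vf
  ; part 1: the %vf elements immediately below part 0
  %p1.mid = getelementptr inbounds i32, ptr %base, i64 %mul.m1.vf  ; -1 * %vf
  %p1     = getelementptr inbounds i32, ptr %p1.mid, i64 %sub.1.vf ; 1 - %vf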

From 15c8f0ba286a65760d4cb478021512b747a3feb0 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:37:53 -0800
Subject: [PATCH 3/8] Step 1: Patch legal check for strided accesses

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 226fc23888f02..5948fd6fcb132 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1308,6 +1308,18 @@ class LoopVectorizationCostModel {
            (SI && TTI.isLegalMaskedScatter(Ty, Align));
   }
 
+  /// Returns true if the target machine can represent \p V as a strided load
+  /// or store operation.
+  bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+    if (!isa<LoadInst, StoreInst>(V))
+      return false;
+    auto *Ty = getLoadStoreType(V);
+    Align Align = getLoadStoreAlignment(V);
+    if (VF.isVector())
+      Ty = VectorType::get(Ty, VF);
+    return TTI.isLegalStridedLoadStore(Ty, Align);
+  }
+
   /// Returns true if the target machine supports all of the reduction
   /// variables found for the given VF.
   bool canVectorizeReductions(ElementCount VF) const {
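
Step 1 mirrors the shape of the existing legality helpers around it: the scalar access type is widened to the candidate VF and TTI is asked whether a strided load or store of that vector type and alignment is supported. For the i32 tests at VF = vscale x 4, the access whose legality is being queried ultimately takes the form of the generic VP strided-load intrinsic emitted in Step 3; its declaration, for reference (the intrinsic and its mangling are the in-tree ones, shown here only for illustration):

  declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(
      ptr, i64, <vscale x 4 x i1>, i32)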

From 1ae1c1ff4665b8fa57f801a899434930a0e550d0 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 19 Feb 2025 01:38:46 -0800
Subject: [PATCH 4/8] Step 2: Patch cost for strided accesses

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5948fd6fcb132..f8e8822edf0dc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1657,6 +1657,9 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for a strided load/store instruction.
+  InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -5825,6 +5828,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+                                                    ElementCount VF) {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
+  const Value *Ptr = getLoadStorePointerOperand(I);
+
+  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
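
Step 2 routes strided candidates to TTI::getStridedMemoryOpCost rather than the address-computation-plus-gather/scatter path, passing Legal->isMaskRequired(I) so predicated accesses are priced as masked strided ops. For the store side of the tests, the operation being costed would be the strided VP store; the store lowering is not visible in the hunks quoted here, so this sketch is an assumption mirroring the load path (illustrative names):

  call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(
      <vscale x 4 x i32> %val, ptr align 4 %a.end, i64 -4,
      <vscale x 4 x i1> splat (i1 true), i32 %evl)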

From 4fcaa5914567ed8fb85860cb75aea7bce38d22fa Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 20 Feb 2025 06:09:52 -0800
Subject: [PATCH 5/8] Step 3: Patch the implementation of load/store recipes

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  29 +++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 113 ++++++++++++++----
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   4 +-
 .../Transforms/Vectorize/VPlanTest.cpp        |   7 +-
 5 files changed, 118 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f8e8822edf0dc..eb468545d2235 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8382,12 +8382,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     Ptr = VectorPtr;
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, false,
                                  I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, I->getDebugLoc());
+                                Reverse, false, I->getDebugLoc());
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f1af7f87e554..266f070b91a7b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2560,6 +2560,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   /// Whether the consecutive accessed addresses are in reverse order.
   bool Reverse;
 
+  /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
+  bool Strided;
+
   /// Whether the memory access is masked.
   bool IsMasked = false;
 
@@ -2573,9 +2576,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
 
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse, DebugLoc DL)
+                      bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
-        Reverse(Reverse) {
+        Reverse(Reverse), Strided(Strided) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
   }
 
@@ -2603,6 +2606,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   /// order.
   bool isReverse() const { return Reverse; }
 
+  /// Return whether the accessed addresses are evenly spaced apart by a fixed
+  /// stride.
+  bool isStrided() const { return Strided; }
+
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const { return getOperand(0); }
 
@@ -2632,16 +2639,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
 /// optional mask.
 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse, DebugLoc DL)
+                    bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, DL),
+                            Reverse, Strided, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse,
+                                 getMask(), Consecutive, Reverse, Strided,
                                  getDebugLoc());
   }
 
@@ -2673,7 +2680,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
                             {L.getAddr(), &EVL}, L.isConsecutive(),
-                            L.isReverse(), L.getDebugLoc()),
+                            L.isReverse(), L.isStrided(), L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -2710,16 +2717,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 /// to store to and an optional mask.
 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
-                     VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
+                     VPValue *Mask, bool Consecutive, bool Reverse,
+                     bool Strided, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, DL) {
+                            Consecutive, Reverse, Strided, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, getDebugLoc());
+                                  Reverse, Strided, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2753,7 +2761,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {S.getAddr(), S.getStoredValue(), &EVL},
-                            S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
+                            S.isConsecutive(), S.isReverse(), S.isStrided(),
+                            S.getDebugLoc()) {
     setMask(Mask);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8a928fa71fab8..bf2e4867b56d1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2561,10 +2561,15 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
-    return Ctx.TTI.getAddressComputationCost(Ty) +
-           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                          IsMasked, Alignment, Ctx.CostKind,
-                                          &Ingredient);
+    if (Strided)
+      return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+                                            IsMasked, Alignment, Ctx.CostKind,
+                                            &Ingredient);
+    else
+      return Ctx.TTI.getAddressComputationCost(Ty) +
+             Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+                                            IsMasked, Alignment, Ctx.CostKind,
+                                            &Ingredient);
   }
 
   InstructionCost Cost = 0;
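
The hunk above dispatches the cost query on the new Strided flag: a strided
recipe is priced with TTI::getStridedMemoryOpCost, while the gather/scatter
fallback additionally pays an address-computation cost. A minimal standalone
C++ sketch of that selection, using placeholder cost values rather than real
TTI numbers:

#include <iostream>

// Hypothetical stand-in for the TTI cost interface; the numbers are made up.
struct FakeTTI {
  unsigned stridedMemoryOpCost() const { return 4; }    // models getStridedMemoryOpCost
  unsigned addressComputationCost() const { return 1; } // models getAddressComputationCost
  unsigned gatherScatterOpCost() const { return 16; }   // models getGatherScatterOpCost
};

// Mirrors the branch in VPWidenMemoryRecipe::computeCost above.
unsigned memoryRecipeCost(const FakeTTI &TTI, bool Strided) {
  if (Strided)
    return TTI.stridedMemoryOpCost();
  return TTI.addressComputationCost() + TTI.gatherScatterOpCost();
}

int main() {
  FakeTTI TTI;
  std::cout << "strided: " << memoryRecipeCost(TTI, true) << "\n";         // 4
  std::cout << "gather/scatter: " << memoryRecipeCost(TTI, false) << "\n"; // 17
}
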
@@ -2591,11 +2596,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  bool CreateGather = !isConsecutive();
+  bool CreateGather = !isConsecutive() && !isStrided();
 
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
-  Value *Mask = nullptr;
+  Value *Mask = isStrided()
+                    ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+                    : nullptr;
   if (auto *VPMask = getMask()) {
     // Mask reversal is only needed for non-all-one masks; an all-one mask
     // is represented as null, and the reverse of a null mask is still null.
@@ -2610,9 +2617,25 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                        "wide.masked.gather");
   } else if (Mask) {
-    NewLI =
-        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
-                                 PoisonValue::get(DataTy), "wide.masked.load");
+    if (isStrided()) {
+      const DataLayout &DL = LI->getDataLayout();
+      auto *PtrTy = Addr->getType();
+      auto *StrideTy = DL.getIndexType(PtrTy);
+      // TODO: Support non-unit-reverse strided accesses.
+      auto *StrideVal =
+          ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+      Value *RuntimeVF =
+          getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+      NewLI = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+          {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
+      cast<CallInst>(NewLI)->addParamAttr(
+          0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+    } else {
+      NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                       PoisonValue::get(DataTy),
+                                       "wide.masked.load");
+    }
   } else {
     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
   }
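
For reference, the strided load emitted above behaves like a per-lane load
from Addr + I * Stride; with a stride of -sizeof(element) the loaded vector
already comes back in reverse order, which is why no vector.reverse is
needed. A scalar reference model of that semantics (illustrative only;
masked-off lanes are effectively undefined in the real intrinsic):

#include <cstddef>
#include <iostream>
#include <vector>

// Scalar model of llvm.experimental.vp.strided.load.
template <typename T>
std::vector<T> vpStridedLoad(const T *Addr, std::ptrdiff_t StrideBytes,
                             const std::vector<bool> &Mask, unsigned EVL) {
  std::vector<T> Result(EVL);
  const char *P = reinterpret_cast<const char *>(Addr);
  for (unsigned I = 0; I < EVL; ++I)
    if (Mask[I]) // masked-off lanes are left unspecified by the intrinsic
      Result[I] = *reinterpret_cast<const T *>(
          P + static_cast<std::ptrdiff_t>(I) * StrideBytes);
  return Result;
}

int main() {
  int B[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  // Base &B[7], stride -4 bytes (i.e. -sizeof(int) here): loads B[7..4].
  auto V = vpStridedLoad(&B[7], -4, {true, true, true, true}, 4);
  for (int X : V)
    std::cout << X << ' '; // prints: 7 6 5 4
}
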
@@ -2650,7 +2673,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  bool CreateGather = !isConsecutive();
+  bool CreateGather = !isConsecutive() && !isStrided();
 
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
@@ -2670,6 +2693,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
     NewLI =
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
+  } else if (isStrided()) {
+    const DataLayout &DL = LI->getDataLayout();
+    auto *PtrTy = Addr->getType();
+    auto *StrideTy = DL.getIndexType(PtrTy);
+    // TODO: Support non-unit-reverse strided accesses.
+    auto *StrideVal =
+        ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+    NewLI = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+        {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
   } else {
     VectorBuilder VBuilder(Builder);
     VBuilder.setEVL(EVL).setMask(Mask);
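
The EVL variant above differs from the plain recipe only in the
vector-length operand: instead of the full runtime VF it passes the EVL
computed from the remaining trip count. A minimal model of that
per-iteration length, assuming get.vector.length simply clamps to
min(remaining, VF) (the real intrinsic has looser guarantees):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Simplified model of llvm.experimental.get.vector.length for a fixed VF.
uint32_t getVectorLength(uint64_t Remaining, uint32_t VF) {
  return static_cast<uint32_t>(std::min<uint64_t>(Remaining, VF));
}

int main() {
  const uint32_t VF = 4; // stands in for vscale * 4
  for (uint64_t I = 0, N = 10; I < N;) {
    uint32_t EVL = getVectorLength(N - I, VF);
    std::cout << "iteration handles " << EVL << " lanes\n"; // 4, 4, 2
    I += EVL;
  }
}
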
@@ -2724,13 +2757,15 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
   VPValue *StoredVPValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
+  bool CreateScatter = !isConsecutive() && !isStrided();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
 
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
 
-  Value *Mask = nullptr;
+  Value *Mask = isStrided()
+                    ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+                    : nullptr;
   if (auto *VPMask = getMask()) {
     // Mask reversal is only needed for non-all-one masks; an all-one mask
     // is represented as null, and the reverse of a null mask is still null.
@@ -2749,12 +2784,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   }
   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
   Instruction *NewSI = nullptr;
-  if (CreateScatter)
+  if (CreateScatter) {
     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
-  else if (Mask)
-    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
-  else
+  } else if (Mask) {
+    if (isStrided()) {
+      const DataLayout &DL = SI->getDataLayout();
+      auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+      Type *StoredEltTy = StoredVecTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      auto *StrideTy = DL.getIndexType(PtrTy);
+      // TODO: Support non-unit-reverse strided accesses.
+      auto *StrideVal =
+          ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+      Value *RuntimeVF =
+          getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+      NewSI = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_store,
+          {StoredVecTy, PtrTy, StrideTy},
+          {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
+      cast<CallInst>(NewSI)->addParamAttr(
+          1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+    } else {
+      NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+    }
+  } else {
     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+  }
   State.addMetadata(NewSI, SI);
 }
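
Symmetrically to the load path, the strided store above writes lane I to
Addr + I * Stride, so a stride of -sizeof(element) stores the vector in
reverse element order without an explicit reverse shuffle. A scalar
reference model (illustrative only):

#include <cstddef>
#include <iostream>

// Scalar model of llvm.experimental.vp.strided.store.
template <typename T, unsigned N>
void vpStridedStore(const T (&Val)[N], T *Addr, std::ptrdiff_t StrideBytes,
                    const bool (&Mask)[N], unsigned EVL) {
  char *P = reinterpret_cast<char *>(Addr);
  for (unsigned I = 0; I < EVL && I < N; ++I)
    if (Mask[I])
      *reinterpret_cast<T *>(P + static_cast<std::ptrdiff_t>(I) *
                                     StrideBytes) = Val[I];
}

int main() {
  int A[4] = {0, 0, 0, 0};
  int Val[4] = {10, 11, 12, 13};
  bool Mask[4] = {true, true, true, true};
  vpStridedStore(Val, &A[3], -4, Mask, 4); // A[3]=10, A[2]=11, A[1]=12, A[0]=13
  for (int X : A)
    std::cout << X << ' '; // prints: 13 12 11 10
}
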
 
@@ -2770,7 +2825,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
   VPValue *StoredValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
+  bool CreateScatter = !isConsecutive() && !isStrided();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
 
   auto &Builder = State.Builder;
@@ -2795,11 +2850,25 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
+    if (isStrided()) {
+      const DataLayout &DL = SI->getDataLayout();
+      auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+      Type *StoredEltTy = StoredVecTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      auto *StrideTy = DL.getIndexType(PtrTy);
+      // TODO: Support non-unit-reverse strided accesses.
+      auto *StrideVal =
+          ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+      NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+                                      {StoredVecTy, PtrTy, StrideTy},
+                                      {StoredVal, Addr, StrideVal, Mask, EVL});
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Store, Type::getVoidTy(EVL->getContext()),
+          {StoredVal, Addr}));
+    }
   }
   NewSI->addParamAttr(
       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
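
In all four emission paths the byte stride is computed as
-1 * DL.getTypeAllocSize(EltTy), i.e. minus the element's allocation size,
and the alignment is re-attached as a parameter attribute on the pointer
operand. A trivial standalone check of the stride arithmetic (the sizes
below are the usual ones; real values come from the target DataLayout):

#include <cstdint>
#include <iostream>

// Byte stride for a unit-reverse access over elements of the given size.
int64_t reverseByteStride(uint64_t AllocSizeBytes) {
  return -1 * static_cast<int64_t>(AllocSizeBytes);
}

int main() {
  std::cout << reverseByteStride(4) << "\n"; // i32/float  -> -4
  std::cout << reverseByteStride(8) << "\n"; // i64/double -> -8
}
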
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 13ef3029023f1..b90a762f0aad7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -73,13 +73,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
         if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
           NewRecipe = new VPWidenLoadRecipe(
               *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
-              false /*Consecutive*/, false /*Reverse*/,
+              false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
               Ingredient.getDebugLoc());
         } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
           NewRecipe = new VPWidenStoreRecipe(
               *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
               nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
-              Ingredient.getDebugLoc());
+              false /*Strided*/, Ingredient.getDebugLoc());
         } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
           NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
         } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 5f73aa43daef9..f0c35169f3bef 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1084,7 +1084,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
       new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
   VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
   VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
-  VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
+  VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1195,7 +1195,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
         new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
     VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
     VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
-    VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {});
+    VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {});
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
     EXPECT_TRUE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1209,7 +1209,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
     VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
     VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
-    VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {});
+    VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false,
+                              {});
     EXPECT_TRUE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_TRUE(Recipe.mayWriteToMemory());

>From a5079d6010d3fbbfd8b7ad324dda0c9136147ead Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 21 Feb 2025 02:38:30 -0800
Subject: [PATCH 6/8] Step 4: Connect the recipes and cost model via InstWidening

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  21 +-
 .../RISCV/riscv-vector-reverse-output.ll      | 334 ++++++++++--------
 ...-force-tail-with-evl-reverse-load-store.ll | 197 +----------
 ...orize-force-tail-with-evl-uniform-store.ll |  52 +--
 4 files changed, 215 insertions(+), 389 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index eb468545d2235..329f9cf7d067e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Strided,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -6160,6 +6161,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
                "Expected consecutive stride.");
         InstWidening Decision =
             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        // For consecutive reverse accesses, consider a strided load/store
+        // when it is cheaper than the reversed wide access.
+        if (ConsecutiveStride == -1) {
+          const InstructionCost StridedLoadStoreCost =
+              isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
+                                              : InstructionCost::getInvalid();
+          if (StridedLoadStoreCost < Cost) {
+            Decision = CM_Strided;
+            Cost = StridedLoadStoreCost;
+          }
+        }
         setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
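
A standalone model of the decision above: for a consecutive stride of -1
the cost model now prefers CM_Strided, but only when a strided access is
legal and strictly cheaper than the reverse-widened form (an illegal
strided access gets an invalid, i.e. maximal, cost). Cost values here are
placeholders:

#include <iostream>
#include <limits>

enum InstWidening { CM_Widen, CM_Widen_Reverse, CM_Strided };

InstWidening pickDecision(int ConsecutiveStride, bool StridedLegal,
                          unsigned ReverseCost, unsigned StridedCost) {
  InstWidening Decision =
      ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
  if (ConsecutiveStride == -1) {
    // Models InstructionCost::getInvalid() with a maximal cost.
    unsigned Candidate =
        StridedLegal ? StridedCost : std::numeric_limits<unsigned>::max();
    if (Candidate < ReverseCost)
      Decision = CM_Strided;
  }
  return Decision;
}

int main() {
  // Strided legal and cheaper than reverse: choose CM_Strided.
  std::cout << (pickDecision(-1, true, 10, 4) == CM_Strided) << "\n"; // 1
  // Strided illegal: keep CM_Widen_Reverse.
  std::cout << (pickDecision(-1, false, 10, 4) == CM_Widen_Reverse) << "\n"; // 1
}
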
@@ -6805,6 +6817,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
         return TTI::CastContextHint::Normal;
 
       switch (getWideningDecision(I, VF)) {
+      // TODO: New CastContextHint for strided accesses.
+      case LoopVectorizationCostModel::CM_Strided:
       case LoopVectorizationCostModel::CM_GatherScatter:
         return TTI::CastContextHint::GatherScatter;
       case LoopVectorizationCostModel::CM_Interleave:
@@ -8356,6 +8370,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -8382,12 +8397,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     Ptr = VectorPtr;
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, false,
-                                 I->getDebugLoc());
+    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+                                 Strided, I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, false, I->getDebugLoc());
+                                Reverse, Strided, I->getDebugLoc());
 }
 
 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
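
The flag derivation in tryToWidenMemory keeps Consecutive false for
CM_Strided, so the strided path skips the vector-pointer adjustment and,
as the regenerated tests below show, later extracts lane 0 of the vector
of pointers as the scalar base address. A minimal model of that mapping:

#include <cassert>

enum InstWidening { CM_Widen, CM_Widen_Reverse, CM_GatherScatter, CM_Strided };

struct RecipeFlags {
  bool Consecutive, Reverse, Strided;
};

// Mirrors the three boolean derivations in tryToWidenMemory above.
RecipeFlags flagsFor(InstWidening Decision) {
  bool Reverse = Decision == CM_Widen_Reverse;
  bool Consecutive = Reverse || Decision == CM_Widen;
  bool Strided = Decision == CM_Strided;
  return {Consecutive, Reverse, Strided};
}

int main() {
  assert(flagsFor(CM_Strided).Strided && !flagsFor(CM_Strided).Consecutive);
  assert(flagsFor(CM_Widen_Reverse).Reverse &&
         flagsFor(CM_Widen_Reverse).Consecutive);
  return 0;
}
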
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index a15a8342154ff..1aa01d7aabee9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -53,30 +53,34 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV64-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV64-NEXT:    [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
+; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; RV64-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; RV64-NEXT:    [[TMP18:%.*]] = mul <vscale x 4 x i32> [[TMP17]], splat (i32 -1)
+; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP18]]
+; RV64-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP14]] to i32
+; RV64-NEXT:    [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
+; RV64-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
+; RV64-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64:       [[VECTOR_BODY]]:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
-; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV64-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
-; RV64-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
-; RV64-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
-; RV64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
-; RV64-NEXT:    [[TMP21:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT:    [[TMP22:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP21]]
-; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
-; RV64-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
-; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
+; RV64-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i32> [[TMP21]] to <vscale x 4 x i64>
+; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP22]]
+; RV64-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 4 x ptr> [[TMP23]], i32 0
+; RV64-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP27]], 4
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP24]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP26]])
 ; RV64-NEXT:    [[TMP25:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
-; RV64-NEXT:    [[TMP27:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT:    [[TMP28:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP27]]
-; RV64-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP28]]
-; RV64-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP25]])
-; RV64-NEXT:    store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP22]]
+; RV64-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 4 x ptr> [[TMP28]], i32 0
+; RV64-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP30]], 4
+; RV64-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP25]], ptr align 4 [[TMP29]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV64-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -123,34 +127,36 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; RV32-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; RV32-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; RV32-NEXT:    [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
+; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; RV32-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; RV32-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 -1)
+; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP10]]
+; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP13]]
+; RV32-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
+; RV32-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV32:       [[VECTOR_BODY]]:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
-; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV32-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0
-; RV32-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
-; RV32-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
-; RV32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
-; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT:    [[TMP14:%.*]] = mul i32 0, [[TMP13]]
-; RV32-NEXT:    [[TMP15:%.*]] = sub i32 1, [[TMP13]]
-; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[TMP14]]
-; RV32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP15]]
-; RV32-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
-; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
+; RV32-NEXT:    [[TMP14:%.*]] = zext <vscale x 4 x i32> [[TMP19]] to <vscale x 4 x i64>
+; RV32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP14]]
+; RV32-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x ptr> [[TMP15]], i32 0
+; RV32-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP17]], 4
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP16]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
 ; RV32-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
-; RV32-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT:    [[TMP21:%.*]] = mul i32 0, [[TMP20]]
-; RV32-NEXT:    [[TMP22:%.*]] = sub i32 1, [[TMP20]]
-; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 [[TMP21]]
-; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP22]]
-; RV32-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; RV32-NEXT:    store <vscale x 4 x i32> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP14]]
+; RV32-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x ptr> [[TMP20]], i32 0
+; RV32-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; RV32-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP21]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV32-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV32-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV32:       [[MIDDLE_BLOCK]]:
@@ -158,7 +164,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV32:       [[SCALAR_PH]]:
 ; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
-; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
 ; RV32-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV32:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV32-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -211,45 +217,50 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
 ; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; RV64-UF2-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; RV64-UF2-NEXT:    [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i64 0
+; RV64-UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = trunc <vscale x 4 x i64> [[BROADCAST_SPLAT]] to <vscale x 4 x i32>
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = mul <vscale x 4 x i32> [[TMP18]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
+; RV64-UF2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP20]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP21]]
 ; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64-UF2:       [[VECTOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
-; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP25]], align 4
-; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-UF2-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4
-; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[TMP19]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = add nsw <vscale x 4 x i32> [[STEP_ADD]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = zext <vscale x 4 x i32> [[TMP22]] to <vscale x 4 x i64>
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i32> [[TMP23]] to <vscale x 4 x i64>
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP24]]
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP25]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x ptr> [[TMP26]], i32 0
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP29]], 4
+; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP28]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP34]])
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP27]], i32 0
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
+; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-UF2-NEXT:    [[TMP30:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
 ; RV64-UF2-NEXT:    [[TMP31:%.*]] = add <vscale x 4 x i32> [[REVERSE3]], splat (i32 1)
-; RV64-UF2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
-; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP33]]
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i64 [[TMP34]]
-; RV64-UF2-NEXT:    [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP38:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i64 [[TMP37]]
-; RV64-UF2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 [[TMP38]]
-; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP30]])
-; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP36]], align 4
-; RV64-UF2-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]])
-; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP24]]
+; RV64-UF2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP25]]
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = extractelement <vscale x 4 x ptr> [[TMP36]], i32 0
+; RV64-UF2-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], 4
+; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP30]], ptr align 4 [[TMP38]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP40]])
+; RV64-UF2-NEXT:    [[TMP44:%.*]] = extractelement <vscale x 4 x ptr> [[TMP37]], i32 0
+; RV64-UF2-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP43:%.*]] = mul i32 [[TMP45]], 4
+; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP31]], ptr align 4 [[TMP44]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]])
 ; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[TMP19]]
 ; RV64-UF2-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-UF2-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV64-UF2:       [[MIDDLE_BLOCK]]:
@@ -257,7 +268,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV64-UF2:       [[SCALAR_PH]]:
 ; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
 ; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV64-UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV64-UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -265,7 +276,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    ret void
 ; RV64-UF2:       [[FOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV64-UF2-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV64-UF2-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV64-UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -338,30 +349,34 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV64-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV64-NEXT:    [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
+; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; RV64-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; RV64-NEXT:    [[TMP18:%.*]] = mul <vscale x 4 x i32> [[TMP17]], splat (i32 -1)
+; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP18]]
+; RV64-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP14]] to i32
+; RV64-NEXT:    [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
+; RV64-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
+; RV64-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64:       [[VECTOR_BODY]]:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
-; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV64-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
-; RV64-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
-; RV64-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
-; RV64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
-; RV64-NEXT:    [[TMP21:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT:    [[TMP22:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]]
-; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP22]]
-; RV64-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP24]], align 4
-; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
+; RV64-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i32> [[TMP21]] to <vscale x 4 x i64>
+; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP22]]
+; RV64-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 4 x ptr> [[TMP23]], i32 0
+; RV64-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP27]], 4
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP24]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP26]])
 ; RV64-NEXT:    [[TMP25:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
-; RV64-NEXT:    [[TMP27:%.*]] = mul i64 0, [[TMP14]]
-; RV64-NEXT:    [[TMP28:%.*]] = sub i64 1, [[TMP14]]
-; RV64-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP27]]
-; RV64-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP28]]
-; RV64-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP25]])
-; RV64-NEXT:    store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP30]], align 4
+; RV64-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP22]]
+; RV64-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 4 x ptr> [[TMP28]], i32 0
+; RV64-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP30]], 4
+; RV64-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP25]], ptr align 4 [[TMP29]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV64-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -408,34 +423,36 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; RV32-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; RV32-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; RV32-NEXT:    [[TMP8:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
+; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; RV32-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; RV32-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 -1)
+; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP10]]
+; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
+; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP13]]
+; RV32-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
+; RV32-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV32:       [[VECTOR_BODY]]:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
-; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV32-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 0
-; RV32-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
-; RV32-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
-; RV32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
-; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT:    [[TMP14:%.*]] = mul i32 0, [[TMP13]]
-; RV32-NEXT:    [[TMP15:%.*]] = sub i32 1, [[TMP13]]
-; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP14]]
-; RV32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP15]]
-; RV32-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
-; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
+; RV32-NEXT:    [[TMP14:%.*]] = zext <vscale x 4 x i32> [[TMP19]] to <vscale x 4 x i64>
+; RV32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP14]]
+; RV32-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x ptr> [[TMP15]], i32 0
+; RV32-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP17]], 4
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP16]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
 ; RV32-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
-; RV32-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT:    [[TMP21:%.*]] = mul i32 0, [[TMP20]]
-; RV32-NEXT:    [[TMP22:%.*]] = sub i32 1, [[TMP20]]
-; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 [[TMP21]]
-; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP22]]
-; RV32-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP18]])
-; RV32-NEXT:    store <vscale x 4 x float> [[REVERSE2]], ptr [[TMP24]], align 4
+; RV32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP14]]
+; RV32-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x ptr> [[TMP20]], i32 0
+; RV32-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; RV32-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP21]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV32-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV32-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV32:       [[MIDDLE_BLOCK]]:
@@ -443,7 +460,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV32:       [[SCALAR_PH]]:
 ; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
-; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
 ; RV32-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV32:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV32-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -496,45 +513,50 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
 ; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; RV64-UF2-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; RV64-UF2-NEXT:    [[TMP17:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i64 0
+; RV64-UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = trunc <vscale x 4 x i64> [[BROADCAST_SPLAT]] to <vscale x 4 x i32>
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = mul <vscale x 4 x i32> [[TMP18]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
+; RV64-UF2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP20]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP21]]
 ; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64-UF2:       [[VECTOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
-; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP18]], -1
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP25]], align 4
-; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-UF2-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4
-; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD2]])
+; RV64-UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[TMP19]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = add nsw <vscale x 4 x i32> [[STEP_ADD]], splat (i32 -1)
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = zext <vscale x 4 x i32> [[TMP22]] to <vscale x 4 x i64>
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i32> [[TMP23]] to <vscale x 4 x i64>
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP24]]
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP25]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x ptr> [[TMP26]], i32 0
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP29]], 4
+; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP28]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP34]])
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP27]], i32 0
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
+; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-UF2-NEXT:    [[TMP30:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
 ; RV64-UF2-NEXT:    [[TMP31:%.*]] = fadd <vscale x 4 x float> [[REVERSE3]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]]
-; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i64 0, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP33]]
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP35]], i64 [[TMP34]]
-; RV64-UF2-NEXT:    [[TMP37:%.*]] = mul i64 -1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP38:%.*]] = sub i64 1, [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP32]], i64 [[TMP37]]
-; RV64-UF2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP39]], i64 [[TMP38]]
-; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP30]])
-; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP36]], align 4
-; RV64-UF2-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP31]])
-; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE5]], ptr [[TMP40]], align 4
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP24]]
+; RV64-UF2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP25]]
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = extractelement <vscale x 4 x ptr> [[TMP36]], i32 0
+; RV64-UF2-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], 4
+; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP30]], ptr align 4 [[TMP38]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP40]])
+; RV64-UF2-NEXT:    [[TMP44:%.*]] = extractelement <vscale x 4 x ptr> [[TMP37]], i32 0
+; RV64-UF2-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vscale.i32()
+; RV64-UF2-NEXT:    [[TMP43:%.*]] = mul i32 [[TMP45]], 4
+; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP31]], ptr align 4 [[TMP44]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]])
 ; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; RV64-UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[TMP19]]
 ; RV64-UF2-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-UF2-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV64-UF2:       [[MIDDLE_BLOCK]]:
@@ -542,7 +564,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV64-UF2:       [[SCALAR_PH]]:
 ; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
 ; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV64-UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV64-UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -550,7 +572,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    ret void
 ; RV64-UF2:       [[FOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV64-UF2-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV64-UF2-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV64-UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 5b579b0749c67..622f206743a7c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -12,65 +12,18 @@
 define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
 ; IF-EVL-LABEL: @reverse_load_store(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL:       vector.ph:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
-; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
-; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; IF-EVL:       vector.body:
-; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], -1
-; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IF-EVL:       middle.block:
-; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
-; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; IF-EVL:       for.body:
-; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
-; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
 ; IF-EVL-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
-; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
 ; IF-EVL-NEXT:    store i32 [[TMP]], ptr [[GEPS]], align 4
 ; IF-EVL-NEXT:    [[INC]] = add i32 [[I]], 1
 ; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
 ; IF-EVL:       loopend:
 ; IF-EVL-NEXT:    ret void
 ;
@@ -113,87 +66,25 @@ loopend:
 define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noalias %ptr1, ptr noalias %ptr2) {
 ; IF-EVL-LABEL: @reverse_load_store_masked(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL:       vector.ph:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
-; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
-; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; IF-EVL:       vector.body:
-; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX3]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], splat (i64 1023)
-; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP6]], -1
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
-; IF-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; IF-EVL:       middle.block:
-; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
-; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; IF-EVL:       for.body:
-; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
-; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ]
+; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
+; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ]
 ; IF-EVL-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
-; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[I]]
+; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[I]]
 ; IF-EVL-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
 ; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP]], 100
 ; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; IF-EVL:       if.then:
-; IF-EVL-NEXT:    [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[ADD]]
 ; IF-EVL-NEXT:    [[V:%.*]] = load i32, ptr [[GEPL1]], align 4
-; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
 ; IF-EVL-NEXT:    store i32 [[V]], ptr [[GEPS]], align 4
 ; IF-EVL-NEXT:    br label [[FOR_INC]]
 ; IF-EVL:       for.inc:
 ; IF-EVL-NEXT:    [[INC]] = add i32 [[I]], 1
 ; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
 ; IF-EVL:       loopend:
 ; IF-EVL-NEXT:    ret void
 ;
@@ -255,74 +146,20 @@ loopend:
 define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
 ; IF-EVL-LABEL: @multiple_reverse_vector_pointer(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL:       vector.ph:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
-; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
-; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 1024, [[N_VEC]]
-; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; IF-EVL:       vector.body:
-; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
-; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; IF-EVL:       middle.block:
-; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
 ; IF-EVL:       loop:
-; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[IV]]
 ; IF-EVL-NEXT:    [[X:%.*]] = load i8, ptr [[GEP_A]], align 1
-; IF-EVL-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i8 [[X]]
+; IF-EVL-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B:%.*]], i8 [[X]]
 ; IF-EVL-NEXT:    [[Y:%.*]] = load i8, ptr [[GEP_B]], align 1
-; IF-EVL-NEXT:    [[GEP_C:%.*]] = getelementptr i8, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT:    [[GEP_C:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i8 [[Y]], ptr [[GEP_C]], align 1
-; IF-EVL-NEXT:    [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV]]
+; IF-EVL-NEXT:    [[GEP_D:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i8 [[Y]], ptr [[GEP_D]], align 1
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; IF-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0
-; IF-EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]]
 ; IF-EVL:       exit:
 ; IF-EVL-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 0829bab26f062..86bf440657f5b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -12,57 +12,15 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 2, i1 true)
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP15]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub nuw nsw i64 1, [[IV]]
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
 ; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -83,9 +41,3 @@ loop:
 exit:
   ret void
 }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.

>From 4ccf78c043cc023e43fe7be9a61825fba87a1012 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 06:03:19 -0800
Subject: [PATCH 7/8] Step 4.1: Update debug test case

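Regenerate the checks for riscv-vector-reverse.ll. The dumped instructions
now keep their leading indentation, and with the reversed accesses costed
as strided (stride -1) operations the load/store estimates drop from 13 to
8 while the widened index computation gets costlier (the add goes from 1
to 2, the zext from 1 to 4). Summing the per-instruction estimates, the
i64 loop cost becomes 0+0+2+4+0+8+2+0+8+1+1+0 = 26 instead of 32, and the
f32 variant 28 instead of 34. The printed VPlan uses WIDEN-INDUCTION,
WIDEN-CAST and WIDEN-GEP feeding widened memory operations in place of the
DERIVED-IV/SCALAR-STEPS/reverse-vector-pointer sequence, and register
usage shifts accordingly (GPRRC 3 -> 2, VRRC 2 -> 8).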
---
 .../RISCV/riscv-vector-reverse.ll             | 202 ++++++++----------
 1 file changed, 89 insertions(+), 113 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 4e862bf2f7480..8c2d6cc8fb47e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -23,27 +23,21 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
 ; CHECK-NEXT:  LV: The max safe scalable VF is: vscale x 4294967295.
-; CHECK-NEXT:  LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT:  LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:  LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: Found uniform instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %add9 = add i32 %1, 1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK:       LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
@@ -71,17 +65,14 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, vp<[[VF]]>
+; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN ir<%1> = load ir<%arrayidx>
 ; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%add9>
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
@@ -107,18 +98,18 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %add9 = add i32 %1, 1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV(REG): Calculating max register usage:
 ; CHECK-NEXT:  LV(REG): At #0 Interval # 0
 ; CHECK-NEXT:  LV(REG): At #1 Interval # 1
@@ -132,14 +123,14 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #10 Interval # 2
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 32
+; CHECK-NEXT:  LV: Loop cost is 26
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -194,17 +185,14 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:      WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, ir<%18>
+; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN ir<[[LD:%.+]]> = load ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%add9> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%add9>
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
@@ -272,27 +260,21 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
 ; CHECK-NEXT:  LV: The max safe scalable VF is: vscale x 4294967295.
-; CHECK-NEXT:  LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT:  LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:  LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: Found uniform instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK:       LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
@@ -320,17 +302,14 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, vp<[[VF]]>
+; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN ir<%1> = load ir<%arrayidx>
 ; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%conv1>
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
@@ -356,18 +335,18 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV(REG): Calculating max register usage:
 ; CHECK-NEXT:  LV(REG): At #0 Interval # 0
 ; CHECK-NEXT:  LV(REG): At #1 Interval # 1
@@ -381,14 +360,14 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #10 Interval # 2
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 34
+; CHECK-NEXT:  LV: Loop cost is 28
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -443,17 +422,14 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, ir<%18>
+; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN ir<[[L:%.+]]> = load ir<%arrayidx>
 ; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
+; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%conv1>
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors

>From 1836f610d09f258dc3f8aa555ac44ea5783ad418 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 25 Feb 2025 23:26:03 -0800
Subject: [PATCH 8/8] Opt: Remove dependency on VPWidenIntOrFpInductionRecipe
 and extend VPVectorPointerRecipe

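Instead of going through VPWidenIntOrFpInductionRecipe and a widened GEP,
form the address from the scalar base and let VPVectorPointerRecipe carry
a Strided flag: for a stride of -1 the per-part step is negated, so each
unrolled part starts VF elements below the previous one and the strided
load/store walks backwards from there (see the updated
riscv-vector-reverse-output.ll checks, where the splat/stepvector/
extractelement sequence collapses to a scalar GEP feeding
@llvm.experimental.vp.strided.load/store).

Below is a minimal standalone sketch of the address arithmetic, written as
plain C++ rather than LLVM code; the VF and the part count are
illustrative assumptions, not values taken from the patch:

  #include <cstdint>
  #include <cstdio>

  // Mirrors the step computed in VPVectorPointerRecipe::execute:
  //   Step      = Strided ? -1 * CurrentPart : CurrentPart
  //   Increment = Step * VF   (createStepForVF scales the step by VF)
  // The memory operation itself then uses a negative byte stride,
  // e.g. the `i64 -4` operand of llvm.experimental.vp.strided.load
  // for i32 elements in the updated tests.
  static int64_t partBaseOffset(bool Strided, int64_t CurrentPart,
                                int64_t VF) {
    int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
    return Step * VF; // element offset of this part's base pointer
  }

  int main() {
    const int64_t VF = 4; // illustrative: a fixed VF stands in for vscale x 4
    for (int64_t Part = 0; Part < 2; ++Part)
      printf("part %lld starts at element offset %lld\n", (long long)Part,
             (long long)partBaseOffset(/*Strided=*/true, Part, VF));
    return 0;
  }

Running it prints element offsets 0 and -4: with a stride of -1, part 1's
base pointer sits VF elements below part 0's, matching the reverse
traversal of the original loop.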
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  20 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  11 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   4 +-
 .../RISCV/riscv-vector-reverse-output.ll      | 228 +++++++-----------
 .../RISCV/riscv-vector-reverse.ll             | 130 +++++-----
 ...-force-tail-with-evl-reverse-load-store.ll | 175 ++++++++++++--
 ...orize-force-tail-with-evl-uniform-store.ll |  55 ++++-
 7 files changed, 395 insertions(+), 228 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 329f9cf7d067e..bf56101977b98 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3627,9 +3627,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
       return true;
 
-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -8367,17 +8367,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   // reverse consecutive.
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
+
+  auto SameWiden = [&](ElementCount VF) -> bool {
+    return Decision == CM.getWideningDecision(I, VF);
+  };
+  bool ContainsWidenVF =
+      LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+  assert(ContainsWidenVF &&
+         "At least widen the memory accesses by the Start VF.");
+
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
   bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
-  if (Consecutive) {
+  if (Consecutive || Strided) {
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
     if (Reverse) {
+      assert(!Strided && "Reverse and Strided are mutually exclusive.");
       // When folding the tail, we may compute an address that we don't in the
       // original scalar loop and it may not be inbounds. Drop Inbounds in that
       // case.
@@ -8388,7 +8398,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
       VectorPtr = new VPReverseVectorPointerRecipe(
           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
     } else {
-      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 266f070b91a7b..ad74ebda7ad05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1568,12 +1568,15 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
                               public VPUnrollPartAccessor<1> {
   Type *IndexedTy;
 
+  /// Indicates whether the pointer is computed for strided memory accesses.
+  bool Strided;
+
 public:
-  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
-                        DebugLoc DL)
+  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+                        GEPNoWrapFlags GEPFlags, DebugLoc DL)
       : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
                             GEPFlags, DL),
-        IndexedTy(IndexedTy) {}
+        IndexedTy(IndexedTy), Strided(Strided) {}
 
   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
 
@@ -1594,7 +1597,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
   }
 
   VPVectorPointerRecipe *clone() override {
-    return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+    return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
                                      getGEPNoWrapFlags(), getDebugLoc());
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf2e4867b56d1..7eb3e3b3c835e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2115,7 +2115,9 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
                                 CurrentPart, Builder);
   Value *Ptr = State.get(getOperand(0), VPLane(0));
 
-  Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+  // TODO: Support non-unit-reverse strided accesses.
+  int64_t Step = Strided ? -1 * CurrentPart : CurrentPart;
+  Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Step);
   Value *ResultPtr =
       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 1aa01d7aabee9..c8a2a9ad1c613 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -53,34 +53,26 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV64-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV64-NEXT:    [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
-; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
-; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; RV64-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; RV64-NEXT:    [[TMP18:%.*]] = mul <vscale x 4 x i32> [[TMP17]], splat (i32 -1)
-; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP18]]
-; RV64-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP14]] to i32
-; RV64-NEXT:    [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
-; RV64-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
-; RV64-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64:       [[VECTOR_BODY]]:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
-; RV64-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i32> [[TMP21]] to <vscale x 4 x i64>
-; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP22]]
-; RV64-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 4 x ptr> [[TMP23]], i32 0
+; RV64-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
 ; RV64-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP27]], 4
-; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP24]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP26]])
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP34]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP26]])
 ; RV64-NEXT:    [[TMP25:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP22]]
-; RV64-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 4 x ptr> [[TMP28]], i32 0
+; RV64-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 0
 ; RV64-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP30]], 4
-; RV64-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP25]], ptr align 4 [[TMP29]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
+; RV64-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP25]], ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
-; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV64-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -88,7 +80,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV64:       [[SCALAR_PH]]:
 ; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
 ; RV64-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV64:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV64-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -96,7 +88,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    ret void
 ; RV64:       [[FOR_BODY]]:
 ; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV64-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV64-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -129,34 +121,26 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
-; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; RV32-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; RV32-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 -1)
-; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP10]]
-; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP13]]
-; RV32-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
-; RV32-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV32:       [[VECTOR_BODY]]:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
-; RV32-NEXT:    [[TMP14:%.*]] = zext <vscale x 4 x i32> [[TMP19]] to <vscale x 4 x i64>
-; RV32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP14]]
-; RV32-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x ptr> [[TMP15]], i32 0
+; RV32-NEXT:    [[DOTCAST2:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT:    [[OFFSET_IDX1:%.*]] = sub i32 [[N]], [[DOTCAST2]]
+; RV32-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX1]], 0
+; RV32-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
+; RV32-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
 ; RV32-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
 ; RV32-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP17]], 4
-; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP16]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP27]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
 ; RV32-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP14]]
-; RV32-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x ptr> [[TMP20]], i32 0
+; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
 ; RV32-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
 ; RV32-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
-; RV32-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP21]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
+; RV32-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP28]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV32-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV32-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV32:       [[MIDDLE_BLOCK]]:
@@ -164,7 +148,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV32:       [[SCALAR_PH]]:
 ; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
-; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
 ; RV32-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV32:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV32-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -172,7 +156,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    ret void
 ; RV32:       [[FOR_BODY]]:
 ; RV32-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV32-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV32-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -214,53 +198,43 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
 ; RV64-UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; RV64-UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 8
 ; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV64-UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i64 0
-; RV64-UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = trunc <vscale x 4 x i64> [[BROADCAST_SPLAT]] to <vscale x 4 x i32>
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = mul <vscale x 4 x i32> [[TMP18]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
-; RV64-UF2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP20]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP21]]
 ; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64-UF2:       [[VECTOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[TMP19]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = add nsw <vscale x 4 x i32> [[STEP_ADD]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = zext <vscale x 4 x i32> [[TMP22]] to <vscale x 4 x i64>
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i32> [[TMP23]] to <vscale x 4 x i64>
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP24]]
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 4 x i64> [[TMP25]]
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x ptr> [[TMP26]], i32 0
+; RV64-UF2-NEXT:    [[DOTCAST2:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT:    [[OFFSET_IDX1:%.*]] = sub i32 [[N]], [[DOTCAST2]]
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX1]], 0
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP19]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP36]], 4294967292
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP46]]
 ; RV64-UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP29]], 4
 ; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP28]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP34]])
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP27]], i32 0
 ; RV64-UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
 ; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-UF2-NEXT:    [[TMP30:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
 ; RV64-UF2-NEXT:    [[TMP31:%.*]] = add <vscale x 4 x i32> [[REVERSE3]], splat (i32 1)
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP24]]
-; RV64-UF2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP25]]
-; RV64-UF2-NEXT:    [[TMP38:%.*]] = extractelement <vscale x 4 x ptr> [[TMP36]], i32 0
+; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP47]], i32 0
+; RV64-UF2-NEXT:    [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP49:%.*]] = mul i64 [[TMP48]], 4294967292
+; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP47]], i64 [[TMP49]]
 ; RV64-UF2-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], 4
 ; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP30]], ptr align 4 [[TMP38]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP40]])
-; RV64-UF2-NEXT:    [[TMP44:%.*]] = extractelement <vscale x 4 x ptr> [[TMP37]], i32 0
 ; RV64-UF2-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP43:%.*]] = mul i32 [[TMP45]], 4
 ; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[TMP31]], ptr align 4 [[TMP44]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]])
 ; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; RV64-UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[TMP19]]
 ; RV64-UF2-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-UF2-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV64-UF2:       [[MIDDLE_BLOCK]]:
@@ -268,7 +242,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV64-UF2:       [[SCALAR_PH]]:
 ; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
 ; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV64-UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV64-UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -276,7 +250,7 @@ define void @vector_reverse_i64(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    ret void
 ; RV64-UF2:       [[FOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV64-UF2-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV64-UF2-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV64-UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]]
@@ -349,34 +323,26 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    [[TMP15:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV64-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV64-NEXT:    [[TMP16:%.*]] = sub i32 [[N]], [[DOTCAST]]
-; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
-; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; RV64-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; RV64-NEXT:    [[TMP18:%.*]] = mul <vscale x 4 x i32> [[TMP17]], splat (i32 -1)
-; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP18]]
-; RV64-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP14]] to i32
-; RV64-NEXT:    [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
-; RV64-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
-; RV64-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64:       [[VECTOR_BODY]]:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
-; RV64-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i32> [[TMP21]] to <vscale x 4 x i64>
-; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP22]]
-; RV64-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 4 x ptr> [[TMP23]], i32 0
+; RV64-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
+; RV64-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; RV64-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 0
 ; RV64-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP27]], 4
-; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP24]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP26]])
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP34]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP26]])
 ; RV64-NEXT:    [[TMP25:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP22]]
-; RV64-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 4 x ptr> [[TMP28]], i32 0
+; RV64-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
+; RV64-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
 ; RV64-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP30]], 4
-; RV64-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP25]], ptr align 4 [[TMP29]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
+; RV64-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP25]], ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
-; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV64-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -384,7 +350,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV64:       [[SCALAR_PH]]:
 ; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
 ; RV64-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV64:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV64-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -392,7 +358,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-NEXT:    ret void
 ; RV64:       [[FOR_BODY]]:
 ; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV64-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV64-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
@@ -425,34 +391,26 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV32-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
-; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; RV32-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; RV32-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 -1)
-; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP10]]
-; RV32-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP6]] to i32
-; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP13]]
-; RV32-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
-; RV32-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV32:       [[VECTOR_BODY]]:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
-; RV32-NEXT:    [[TMP14:%.*]] = zext <vscale x 4 x i32> [[TMP19]] to <vscale x 4 x i64>
-; RV32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP14]]
-; RV32-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x ptr> [[TMP15]], i32 0
+; RV32-NEXT:    [[DOTCAST2:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT:    [[OFFSET_IDX1:%.*]] = sub i32 [[N]], [[DOTCAST2]]
+; RV32-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX1]], 0
+; RV32-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1
+; RV32-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
 ; RV32-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
 ; RV32-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP17]], 4
-; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP16]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP27]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP24]])
 ; RV32-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP14]]
-; RV32-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x ptr> [[TMP20]], i32 0
+; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; RV32-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
 ; RV32-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
 ; RV32-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
-; RV32-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP21]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
+; RV32-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP28]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP23]])
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT2]]
 ; RV32-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV32-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV32:       [[MIDDLE_BLOCK]]:
@@ -460,7 +418,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV32:       [[SCALAR_PH]]:
 ; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
-; RV32-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
+; RV32-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ]
 ; RV32-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV32:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV32-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -468,7 +426,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV32-NEXT:    ret void
 ; RV32:       [[FOR_BODY]]:
 ; RV32-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV32-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV32-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
@@ -510,53 +468,43 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP12]]
 ; RV64-UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; RV64-UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 8
 ; RV64-UF2-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
 ; RV64-UF2-NEXT:    [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST1]]
-; RV64-UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i64 0
-; RV64-UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = trunc <vscale x 4 x i64> [[BROADCAST_SPLAT]] to <vscale x 4 x i32>
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = mul <vscale x 4 x i32> [[TMP18]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[N]], i64 0
-; RV64-UF2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP20]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> [[DOTSPLAT]], [[TMP21]]
 ; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64-UF2:       [[VECTOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[TMP19]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[VEC_IND]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = add nsw <vscale x 4 x i32> [[STEP_ADD]], splat (i32 -1)
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = zext <vscale x 4 x i32> [[TMP22]] to <vscale x 4 x i64>
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i32> [[TMP23]] to <vscale x 4 x i64>
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP24]]
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP25]]
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 4 x ptr> [[TMP26]], i32 0
+; RV64-UF2-NEXT:    [[DOTCAST2:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT:    [[OFFSET_IDX1:%.*]] = sub i32 [[N]], [[DOTCAST2]]
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX1]], 0
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], -1
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 0
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP36]], 4294967292
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP46]]
 ; RV64-UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP29]], 4
 ; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP28]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP34]])
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP27]], i32 0
 ; RV64-UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
 ; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP35]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP33]])
 ; RV64-UF2-NEXT:    [[TMP30:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
 ; RV64-UF2-NEXT:    [[TMP31:%.*]] = fadd <vscale x 4 x float> [[REVERSE3]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP24]]
-; RV64-UF2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], <vscale x 4 x i64> [[TMP25]]
-; RV64-UF2-NEXT:    [[TMP38:%.*]] = extractelement <vscale x 4 x ptr> [[TMP36]], i32 0
+; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
+; RV64-UF2-NEXT:    [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP49:%.*]] = mul i64 [[TMP48]], 4294967292
+; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP49]]
 ; RV64-UF2-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], 4
 ; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP30]], ptr align 4 [[TMP38]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP40]])
-; RV64-UF2-NEXT:    [[TMP44:%.*]] = extractelement <vscale x 4 x ptr> [[TMP37]], i32 0
 ; RV64-UF2-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vscale.i32()
 ; RV64-UF2-NEXT:    [[TMP43:%.*]] = mul i32 [[TMP45]], 4
 ; RV64-UF2-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i64(<vscale x 4 x float> [[TMP31]], ptr align 4 [[TMP44]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]])
 ; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; RV64-UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[TMP19]]
 ; RV64-UF2-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-UF2-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV64-UF2:       [[MIDDLE_BLOCK]]:
@@ -564,7 +512,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
 ; RV64-UF2:       [[SCALAR_PH]]:
 ; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ]
 ; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
 ; RV64-UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
 ; RV64-UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
@@ -572,7 +520,7 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; RV64-UF2-NEXT:    ret void
 ; RV64-UF2:       [[FOR_BODY]]:
 ; RV64-UF2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[I_0:%.*]], %[[FOR_BODY]] ]
 ; RV64-UF2-NEXT:    [[I_0]] = add nsw i32 [[I_0_IN8]], -1
 ; RV64-UF2-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
 ; RV64-UF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
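The interesting change in the checks above is that the reversed accesses are no
longer emitted as a widened GEP plus extractelement on a <vscale x 4 x ptr>: the
address is computed with scalar arithmetic and fed to
llvm.experimental.vp.strided.load/.store with a byte stride of -4. Per the
intrinsic's documented semantics, lane i is accessed at base + i * stride bytes,
so a negative stride walks backward from the base pointer. A rough scalar model
of the load side (a sketch for illustration only; stridedLoadI32 is not an LLVM
API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar model of llvm.experimental.vp.strided.load for i32 lanes:
// lane i is read from base + i * strideBytes, for i in [0, evl).
// strideBytes == -4 therefore yields the i32 elements in reverse order,
// which is how a memory stride of -1 maps onto this intrinsic.
std::vector<int32_t> stridedLoadI32(const int32_t *base,
                                    std::ptrdiff_t strideBytes, unsigned evl) {
  std::vector<int32_t> lanes;
  lanes.reserve(evl);
  const char *p = reinterpret_cast<const char *>(base);
  for (unsigned i = 0; i != evl; ++i)
    lanes.push_back(*reinterpret_cast<const int32_t *>(
        p + static_cast<std::ptrdiff_t>(i) * strideBytes));
  return lanes;
}

With base pointing at &B[n-1] and strideBytes = -4, the result is B[n-1],
B[n-2], ..., matching the descending order of the scalar loop.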
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 8c2d6cc8fb47e..1f94d03d170a9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -24,12 +24,18 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
 ; CHECK-NEXT:  LV: The max safe scalable VF is: vscale x 4294967295.
 ; CHECK-NEXT:  LV: Found uniform instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found uniform instruction:   %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:  LV: Found uniform instruction:   %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:  LV: Found uniform instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found uniform instruction:   %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
 ; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found uniform instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:  LV: Found uniform instruction:   %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load i32, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %add9 = add i32 %1, 1
@@ -47,7 +53,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
 ; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
@@ -65,14 +70,17 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN ir<%1> = load ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%add9>
+; CHECK-NEXT:      vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT:      vp<[[SCALAR_STEP:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEP]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT:      CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT:      WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT:      WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT:      CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT:      WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
@@ -100,8 +108,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load i32, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %add9 = add i32 %1, 1
@@ -123,14 +131,14 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #10 Interval # 2
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 26
+; CHECK-NEXT:  LV: Loop cost is 22
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -139,8 +147,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK:       Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT:  Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT:  Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -185,15 +192,18 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, ir<%18>
-; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN ir<[[LD:%.+]]> = load ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%add9> = add ir<[[LD]]>, ir<1>
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%add9>
-; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT:      vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT:      vp<[[SCALAR_STEP:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEP]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT:      CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT:      WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT:      WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1>
+; CHECK-NEXT:      CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT:      WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
@@ -261,12 +271,18 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
 ; CHECK-NEXT:  LV: The max safe scalable VF is: vscale x 4294967295.
 ; CHECK-NEXT:  LV: Found uniform instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:  LV: Found uniform instruction:   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:  LV: Found uniform instruction:   %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:  LV: Found uniform instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found uniform instruction:   %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
 ; CHECK-NEXT:  LV: Found uniform instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:  LV: Found uniform instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:  LV: Found uniform instruction:   %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %conv1 = fadd float %1, 1.000000e+00
@@ -284,7 +300,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
 ; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
@@ -302,14 +317,17 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN ir<%1> = load ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%conv1>
+; CHECK-NEXT:      vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT:      vp<[[SCALAR_STEP:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEP]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT:      CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT:      WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT:      WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT:      CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT:      WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
 ; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
@@ -337,8 +355,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %1 = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %conv1 = fadd float %1, 1.000000e+00
@@ -346,7 +364,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 8 for VF vscale x 4 For instruction:   store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV(REG): Calculating max register usage:
 ; CHECK-NEXT:  LV(REG): At #0 Interval # 0
 ; CHECK-NEXT:  LV(REG): At #1 Interval # 1
@@ -360,14 +378,14 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #10 Interval # 2
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 28
+; CHECK-NEXT:  LV: Loop cost is 24
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -376,8 +394,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK:       Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT:  Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT:  Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -422,15 +439,18 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
 ; CHECK-NEXT:      SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:      ir<%i.0.in8> = WIDEN-INDUCTION  ir<%n>, ir<-1>, ir<%18>
-; CHECK-NEXT:      WIDEN ir<%i.0> = add nsw ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT:      WIDEN-CAST ir<%idxprom> = zext ir<%i.0> to i64
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN ir<[[L:%.+]]> = load ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
-; CHECK-NEXT:      WIDEN-GEP Inv[Var] ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      WIDEN store ir<%arrayidx3>, ir<%conv1>
-; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT:      vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT:      vp<[[SCALAR_STEP:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEP]]>, ir<-1>
+; CHECK-NEXT:      CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT:      CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT:      WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>
+; CHECK-NEXT:      WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT:      CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT:      vp<[[ST_PTR:%.+]]> = vector-pointer ir<[[ST_IDX]]>
+; CHECK-NEXT:      WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
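With the widened induction gone, the VPlans above compute one scalar address per
part: DERIVED-IV rewrites the i32 IV as n - canonical_iv, SCALAR-STEPS takes
just lane 0 (which suffices because the access is consecutive with stride -1),
and the add/zext/GEP chain is CLONEd as scalar code feeding a vector-pointer.
Sketched as plain C++ (the function and variable names are mine, not VPlan's):

#include <cstdint>

// One vector iteration's lane-0 address for the B[i] load, mirroring the
// recipe chain printed above: DERIVED-IV, SCALAR-STEPS, then the cloned
// scalar add/zext/getelementptr.
const int32_t *lane0Address(const int32_t *B, int32_t n, int64_t canIV) {
  int32_t derived = n - static_cast<int32_t>(canIV); // DERIVED-IV: n + iv * -1
  int32_t i0 = derived;             // SCALAR-STEPS: lane 0 of the step list
  int32_t idx = i0 - 1;             // CLONE: add nsw ..., -1
  uint64_t idxprom = uint32_t(idx); // CLONE: zext i32 ... to i64
  return B + idxprom;               // CLONE: getelementptr inbounds i32, %B
}

Keeping the address computation scalar is also what the register-usage and cost
lines above reflect: VRRC usage drops from 8 to 2 registers and the loop cost
decreases accordingly, since no vector induction or widened GEP is kept live.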
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 622f206743a7c..6f68e0748becc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -12,18 +12,55 @@
 define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
 ; IF-EVL-LABEL: @reverse_load_store(
 ; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], -1
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT:    [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP13]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY1:%.*]]
 ; IF-EVL:       for.body:
-; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY1]] ]
 ; IF-EVL-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
-; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
 ; IF-EVL-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
-; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
 ; IF-EVL-NEXT:    store i32 [[TMP]], ptr [[GEPS]], align 4
 ; IF-EVL-NEXT:    [[INC]] = add i32 [[I]], 1
 ; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY1]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IF-EVL:       loopend:
 ; IF-EVL-NEXT:    ret void
 ;
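
The functional change in this test: the reverse accesses (element stride -1) are now emitted directly as VP strided memory intrinsics with a -4 byte stride, rather than as consecutive accesses plus a vector reverse. A self-contained sketch of one vector iteration's memory traffic, assuming %evl was produced by get.vector.length and %src.end/%dst.end point at the current highest-addressed element (all names illustrative):

define void @reverse_copy_step(ptr %src.end, ptr %dst.end, i32 %evl) {
  ; Read %evl lanes, stepping back 4 bytes per lane ...
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %src.end, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ; ... and write them out with the same negative stride.
  call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> %v, ptr align 4 %dst.end, i64 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ret void
}

declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr, i64, <vscale x 4 x i1>, i32)
declare void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32>, ptr, i64, <vscale x 4 x i1>, i32)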
@@ -66,25 +103,75 @@ loopend:
 define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noalias %ptr1, ptr noalias %ptr2) {
 ; IF-EVL-LABEL: @reverse_load_store_masked(
 ; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX1]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], splat (i64 1023)
+; IF-EVL-NEXT:    [[TMP13:%.*]] = add i64 [[TMP8]], -1
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
+; IF-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; IF-EVL-NEXT:    [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 -4, <vscale x 4 x i1> [[TMP17]], i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], ptr align 4 [[TMP21]], i64 -4, <vscale x 4 x i1> [[TMP17]], i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY1:%.*]]
 ; IF-EVL:       for.body:
-; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
-; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ]
+; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
+; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ]
 ; IF-EVL-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
-; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[I]]
+; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[I]]
 ; IF-EVL-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
 ; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP]], 100
 ; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; IF-EVL:       if.then:
-; IF-EVL-NEXT:    [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1]], i64 [[ADD]]
 ; IF-EVL-NEXT:    [[V:%.*]] = load i32, ptr [[GEPL1]], align 4
-; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
 ; IF-EVL-NEXT:    store i32 [[V]], ptr [[GEPS]], align 4
 ; IF-EVL-NEXT:    br label [[FOR_INC]]
 ; IF-EVL:       for.inc:
 ; IF-EVL-NEXT:    [[INC]] = add i32 [[I]], 1
 ; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY1]], label [[LOOPEND]], !llvm.loop [[LOOP5:![0-9]+]]
 ; IF-EVL:       loopend:
 ; IF-EVL-NEXT:    ret void
 ;
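
In the masked variant, the if-condition mask is folded into the header mask with a select, and the combined mask goes straight into the strided intrinsics, so inactive lanes generate no memory traffic. A standalone sketch of just that masked step, with the mask, pointer, and function names assumed for illustration:

define void @reverse_copy_masked(ptr %p1.end, ptr %p2.end, <vscale x 4 x i1> %hdr, <vscale x 4 x i1> %cond, i32 %evl) {
  ; Lanes outside the header mask, or failing the if-condition, are off.
  %mask = select <vscale x 4 x i1> %hdr, <vscale x 4 x i1> %cond, <vscale x 4 x i1> zeroinitializer
  %v = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %p1.end, i64 -4, <vscale x 4 x i1> %mask, i32 %evl)
  call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32> %v, ptr align 4 %p2.end, i64 -4, <vscale x 4 x i1> %mask, i32 %evl)
  ret void
}

declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr, i64, <vscale x 4 x i1>, i32)
declare void @llvm.experimental.vp.strided.store.nxv4i32.p0.i64(<vscale x 4 x i32>, ptr, i64, <vscale x 4 x i1>, i32)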
@@ -146,20 +233,74 @@ loopend:
 define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
 ; IF-EVL-LABEL: @multiple_reverse_vector_pointer(
 ; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 1024, [[N_VEC]]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP9]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
+; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
+; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP1:%.*]]
 ; IF-EVL:       loop:
-; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ]
+; IF-EVL-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT:    [[X:%.*]] = load i8, ptr [[GEP_A]], align 1
-; IF-EVL-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B:%.*]], i8 [[X]]
+; IF-EVL-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i8 [[X]]
 ; IF-EVL-NEXT:    [[Y:%.*]] = load i8, ptr [[GEP_B]], align 1
-; IF-EVL-NEXT:    [[GEP_C:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[GEP_C:%.*]] = getelementptr i8, ptr [[C]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i8 [[Y]], ptr [[GEP_C]], align 1
-; IF-EVL-NEXT:    [[GEP_D:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i8 [[Y]], ptr [[GEP_D]], align 1
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; IF-EVL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0
-; IF-EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]]
+; IF-EVL-NEXT:    br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
 ; IF-EVL:       exit:
 ; IF-EVL-NEXT:    ret void
 ;
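
Note that in this test none of the reverse accesses become strided intrinsics: the consecutive ones keep the existing end-pointer recompute plus llvm.experimental.vp.reverse lowering, and the data-dependent b[x] access stays a vp.gather. For contrast with the strided form above, here is a sketch of the reverse-via-permute pattern (names illustrative):

define <vscale x 16 x i8> @load_then_reverse(ptr %lowest, i32 %evl) {
  ; Unit-stride load starting at the lowest address of this chunk ...
  %v = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 %lowest, <vscale x 16 x i1> splat (i1 true), i32 %evl)
  ; ... then permute the first %evl lanes back into source order.
  %r = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 16 x i8> %r
}

declare <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr, <vscale x 16 x i1>, i32)
declare <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32)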
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 86bf440657f5b..3e1d70bfc2cb4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -12,15 +12,52 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub nuw nsw i64 1, [[IV]]
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
-; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i32 0
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP15]], i64 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = sub nuw nsw i64 1, [[IV1]]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP18]]
+; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
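
Here the stored value is uniform (always zero) but the address moves backwards, so the store becomes a single strided store of zeroinitializer with a -8 byte stride. A standalone sketch of that one step, assuming %highest is the current lane-0 address (illustrative):

define void @store_zeros_reversed(ptr %highest, i32 %evl) {
  ; Write %evl zero i64 elements, stepping back 8 bytes per lane.
  call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> zeroinitializer, ptr align 8 %highest, i64 -8, <vscale x 2 x i1> splat (i1 true), i32 %evl)
  ret void
}

declare void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64>, ptr, i64, <vscale x 2 x i1>, i32)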
@@ -41,3 +78,9 @@ loop:
 exit:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.


