[llvm] [RISCV] Set AllocationPriority in line with LMUL (PR #131176)

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 18 08:15:55 PDT 2025


https://github.com/preames updated https://github.com/llvm/llvm-project/pull/131176

>From 39fd13ebacde660b0b1b426691038d3430b1efad Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 13 Mar 2025 10:00:11 -0700
Subject: [PATCH 1/2] [RISCV] Set AllocationPriority in line with LMUL

This mechanism causes the greedy register allocator to prefer allocating
register classes with higher priority first.  This helps to ensure that
high LMUL registers obtain a register without having to go through the
eviction mechanism.  In practice, it seems to cause a bunch of code
churn, and some minor improvement around widening and narrowing
operations.

In a few of the widening tests, we have what look like code size
regressions because we end up with two smaller register class copies
instead of one larger one after the instruction.  However, in any
larger code sequence, these are likely to be folded into the producing
instructions.  (But so were the wider copies after the operation.)

Two observations:
1) We're not setting the greedy-regclass-priority-trumps-globalness flag
   on the register class, so this doesn't help long mask ranges.  I
   thought about doing that, but the benefit is non-obvious, so I
   decided it was worth a separate change at minimum.
2) We could arguably set the priority higher for the register classes
   that exclude v0.  I tried that, and it caused a whole bunch of
   further churn.  I may return to it in a separate patch.
---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td    |    6 +
 .../redundant-copy-from-tail-duplicate.ll     |    6 +-
 .../CodeGen/RISCV/rvv/active_lane_mask.ll     |   12 +-
 .../RISCV/rvv/combine-store-extract-crash.ll  |   40 +-
 .../RISCV/rvv/common-shuffle-patterns.ll      |    7 +-
 llvm/test/CodeGen/RISCV/rvv/compressstore.ll  |   50 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll    |  136 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |   92 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll    |  206 +--
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        |  130 +-
 llvm/test/CodeGen/RISCV/rvv/expandload.ll     | 1282 ++++++++-------
 .../CodeGen/RISCV/rvv/fixed-vectors-abs.ll    |   18 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll   |  188 +--
 .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll   |  232 +--
 .../RISCV/rvv/fixed-vectors-fp-conv.ll        |   12 +-
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll  |   66 +-
 .../RISCV/rvv/fixed-vectors-fp-setcc.ll       |  216 +--
 .../RISCV/rvv/fixed-vectors-fp-shuffles.ll    |    6 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll     | 1456 ++++++++---------
 .../RISCV/rvv/fixed-vectors-fp2i-sat.ll       |   16 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll   |   12 +-
 .../RISCV/rvv/fixed-vectors-fpext-vp.ll       |   15 +-
 .../RISCV/rvv/fixed-vectors-fptosi-vp.ll      |    8 +-
 .../RISCV/rvv/fixed-vectors-fptoui-vp.ll      |    8 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll   |   12 +-
 .../rvv/fixed-vectors-insert-subvector.ll     |   24 +-
 .../RISCV/rvv/fixed-vectors-int-buildvec.ll   |   66 +-
 .../RISCV/rvv/fixed-vectors-int-exttrunc.ll   |   12 +-
 .../RISCV/rvv/fixed-vectors-int-interleave.ll |   81 +-
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   |   46 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-int.ll    |   80 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  483 +++---
 .../rvv/fixed-vectors-inttoptr-ptrtoint.ll    |    4 +-
 .../RISCV/rvv/fixed-vectors-llrint-vp.ll      |   32 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll |   44 +-
 .../RISCV/rvv/fixed-vectors-lrint-vp.ll       |   16 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-lrint.ll  |   76 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |    8 +-
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |    8 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |   16 +-
 .../RISCV/rvv/fixed-vectors-reduction-int.ll  |   16 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-sad.ll    |   28 +-
 .../RISCV/rvv/fixed-vectors-scalarized.ll     |   28 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    |  532 +++---
 .../RISCV/rvv/fixed-vectors-sext-vp.ll        |   35 +-
 .../fixed-vectors-shuffle-changes-length.ll   |   50 +-
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   |   38 +-
 .../rvv/fixed-vectors-shuffle-reverse.ll      |  276 ++--
 .../RISCV/rvv/fixed-vectors-shuffle-rotate.ll |   42 +-
 .../rvv/fixed-vectors-shufflevector-vnsrl.ll  |    6 +-
 .../RISCV/rvv/fixed-vectors-sitofp-vp.ll      |    8 +-
 .../RISCV/rvv/fixed-vectors-uitofp-vp.ll      |    8 +-
 .../RISCV/rvv/fixed-vectors-unaligned.ll      |    8 +-
 .../RISCV/rvv/fixed-vectors-vfadd-vp.ll       |   24 +-
 .../RISCV/rvv/fixed-vectors-vfdiv-vp.ll       |   24 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll  |   24 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll  |   24 +-
 .../RISCV/rvv/fixed-vectors-vfmul-vp.ll       |   24 +-
 ...fixed-vectors-vfpext-constrained-sdnode.ll |   16 +-
 ...fixed-vectors-vfptoi-constrained-sdnode.ll |   32 +-
 .../RISCV/rvv/fixed-vectors-vfsub-vp.ll       |   24 +-
 .../RISCV/rvv/fixed-vectors-vfwmacc.ll        |   36 +-
 ...fixed-vectors-vitofp-constrained-sdnode.ll |   50 +-
 .../RISCV/rvv/fixed-vectors-vpgather.ll       |   48 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vrol.ll   |   12 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vror.ll   |   76 +-
 .../RISCV/rvv/fixed-vectors-vwadd-mask.ll     |   31 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll |    9 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll  |  288 ++--
 .../RISCV/rvv/fixed-vectors-vwsub-mask.ll     |   24 +-
 .../RISCV/rvv/fixed-vectors-zext-vp.ll        |   35 +-
 .../CodeGen/RISCV/rvv/float-round-conv.ll     |   56 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |    8 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |    8 +-
 llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll     |   12 +-
 llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll     |   12 +-
 .../test/CodeGen/RISCV/rvv/half-round-conv.ll |   28 +-
 .../CodeGen/RISCV/rvv/interleave-crash.ll     |   40 +-
 .../RISCV/rvv/intrinsic-vector-match.ll       |  180 +-
 llvm/test/CodeGen/RISCV/rvv/llrint-sdnode.ll  |   12 +-
 llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/lrint-sdnode.ll   |   12 +-
 llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll       |   12 +-
 llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll |   44 +-
 .../RISCV/rvv/named-vector-shuffle-reverse.ll |  280 ++--
 .../CodeGen/RISCV/rvv/narrow-shift-extend.ll  |    6 +-
 llvm/test/CodeGen/RISCV/rvv/pr61561.ll        |   16 +-
 llvm/test/CodeGen/RISCV/rvv/pr95865.ll        |   10 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  130 +-
 .../CodeGen/RISCV/rvv/sink-splat-operands.ll  |   12 +-
 .../CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll   |   16 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |   72 +-
 .../RISCV/rvv/vector-deinterleave-load.ll     |   14 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |  116 +-
 .../RISCV/rvv/vector-interleave-fixed.ll      |  108 +-
 .../RISCV/rvv/vector-interleave-store.ll      |   28 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    |  770 ++++-----
 llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll   |  144 +-
 .../RISCV/rvv/vfadd-constrained-sdnode.ll     |   36 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll       |   72 +-
 .../CodeGen/RISCV/rvv/vfcopysign-sdnode.ll    |   12 +-
 .../RISCV/rvv/vfdiv-constrained-sdnode.ll     |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll       |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        | 1196 ++++++--------
 .../RISCV/rvv/vfmsub-constrained-sdnode.ll    |   82 +-
 .../RISCV/rvv/vfmul-constrained-sdnode.ll     |   36 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll       |   36 +-
 .../RISCV/rvv/vfnmadd-constrained-sdnode.ll   |   70 +-
 .../RISCV/rvv/vfnmsub-constrained-sdnode.ll   |   34 +-
 .../RISCV/rvv/vfpext-constrained-sdnode.ll    |   28 +-
 llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll  |   36 +-
 llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll      |   12 +-
 .../RISCV/rvv/vfptoi-constrained-sdnode.ll    |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll  |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll     |   19 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll     |   19 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |   34 +-
 .../RISCV/rvv/vfsub-constrained-sdnode.ll     |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll       |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll  |   39 +-
 llvm/test/CodeGen/RISCV/rvv/vfwadd.ll         |  102 +-
 llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll       |   40 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll     |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll     |   36 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll    |   36 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll |   24 +-
 .../test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll     |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll    |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll     |   78 +-
 llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll     |   80 +-
 llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll  |   39 +-
 llvm/test/CodeGen/RISCV/rvv/vfwmul.ll         |  102 +-
 llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll    |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll    |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll  |   39 +-
 llvm/test/CodeGen/RISCV/rvv/vfwsub.ll         |  102 +-
 llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll       |   40 +-
 .../RISCV/rvv/vitofp-constrained-sdnode.ll    |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll  |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll  |   28 +-
 llvm/test/CodeGen/RISCV/rvv/vloxei.ll         |  144 +-
 llvm/test/CodeGen/RISCV/rvv/vluxei.ll         |  144 +-
 .../CodeGen/RISCV/rvv/vp-inttoptr-ptrtoint.ll |   12 +-
 .../RISCV/rvv/vp-vector-interleaved-access.ll |   20 +-
 .../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll |   36 +-
 llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll |   10 +-
 llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll    |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll    |   18 +-
 .../RISCV/rvv/vscale-vw-web-simplification.ll |   64 +-
 llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll       |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vsext.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll     |   19 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |   23 +-
 llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll     |   19 +-
 .../CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll    |   31 +-
 llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll   |  110 +-
 llvm/test/CodeGen/RISCV/rvv/vwadd.ll          |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwadd.w.ll        |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vwaddu.ll         |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwaddu.w.ll       |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vwmul-sdnode.ll   |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwmul.ll          |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwmulsu.ll        |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwmulu.ll         |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll   |  258 +--
 llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll       |  216 +--
 llvm/test/CodeGen/RISCV/rvv/vwsll.ll          |  117 +-
 .../CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll    |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vwsub-sdnode.ll   |  102 +-
 llvm/test/CodeGen/RISCV/rvv/vwsub.ll          |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwsub.w.ll        |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vwsubu.ll         |   81 +-
 llvm/test/CodeGen/RISCV/rvv/vwsubu.w.ll       |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll       |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vzext.ll          |   72 +-
 .../CodeGen/RISCV/rvv/zvbb-demanded-bits.ll   |    6 +-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |   26 +-
 179 files changed, 7579 insertions(+), 7479 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index a5dfb5ba1a2fc..1e0541e667895 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -752,18 +752,24 @@ def VR : VReg<!listconcat(VM1VTs, VMaskVTs),
 
 def VRNoV0 : VReg<!listconcat(VM1VTs, VMaskVTs), (sub VR, V0), 1>;
 
+let AllocationPriority = 2 in
 def VRM2 : VReg<VM2VTs, (add (sequence "V%uM2", 8, 31, 2),
                              (sequence "V%uM2", 6, 0, 2)), 2>;
 
+let AllocationPriority = 2 in
 def VRM2NoV0 : VReg<VM2VTs, (sub VRM2, V0M2), 2>;
 
+let AllocationPriority = 4 in
 def VRM4 : VReg<VM4VTs, (add V8M4, V12M4, V16M4, V20M4,
                              V24M4, V28M4, V4M4, V0M4), 4>;
 
+let AllocationPriority = 4 in
 def VRM4NoV0 : VReg<VM4VTs, (sub VRM4, V0M4), 4>;
 
+let AllocationPriority = 8 in
 def VRM8 : VReg<VM8VTs, (add V8M8, V16M8, V24M8, V0M8), 8>;
 
+let AllocationPriority = 8 in
 def VRM8NoV0 : VReg<VM8VTs, (sub VRM8, V0M8), 8>;
 
 def VMV0 : VReg<VMaskVTs, (add V0), 1>;
diff --git a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
index 5d588ad66b9ca..15b5698c22e81 100644
--- a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
+++ b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
@@ -20,10 +20,10 @@ define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check, <vscal
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_4: # %vector.ph
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
-; CHECK-NEXT:    vredsum.vs v8, v12, v8, v0.t
+; CHECK-NEXT:    vredsum.vs v8, v8, v12, v0.t
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 4ade6c09fe43d..ec422a8fbb928 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -106,12 +106,12 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI8_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v16
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index 482cf83d540c4..496755738e6fa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -9,21 +9,21 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    th.lwd a2, a3, (a0), 0, 3
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT:    vle8.v v8, (a2)
+; RV32-NEXT:    vle8.v v12, (a2)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vzext.vf4 v12, v8
-; RV32-NEXT:    vmv.s.x v8, zero
-; RV32-NEXT:    vredsum.vs v9, v12, v8
-; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vzext.vf4 v8, v12
+; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    vredsum.vs v8, v8, v12
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT:    vle8.v v9, (a3)
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vle8.v v13, (a3)
+; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vi v9, v10, 4
+; RV32-NEXT:    vslideup.vi v13, v8, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vzext.vf4 v12, v9
-; RV32-NEXT:    vredsum.vs v8, v12, v8
+; RV32-NEXT:    vzext.vf4 v8, v13
+; RV32-NEXT:    vredsum.vs v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    ret
@@ -32,21 +32,21 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    th.ldd a2, a3, (a0), 0, 4
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vle8.v v8, (a2)
+; RV64-NEXT:    vle8.v v12, (a2)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vzext.vf4 v12, v8
-; RV64-NEXT:    vmv.s.x v8, zero
-; RV64-NEXT:    vredsum.vs v9, v12, v8
-; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    vzext.vf4 v8, v12
+; RV64-NEXT:    vmv.s.x v12, zero
+; RV64-NEXT:    vredsum.vs v8, v8, v12
+; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vle8.v v9, (a3)
-; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vle8.v v13, (a3)
+; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vi v9, v10, 4
+; RV64-NEXT:    vslideup.vi v13, v8, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vzext.vf4 v12, v9
-; RV64-NEXT:    vredsum.vs v8, v12, v8
+; RV64-NEXT:    vzext.vf4 v8, v13
+; RV64-NEXT:    vredsum.vs v8, v8, v12
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a1)
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll
index 1845c0e4bd3b6..7649d9ad6059f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll
@@ -8,10 +8,11 @@ define dso_local <16 x i16> @interleave(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: interleave:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 entry:
   %v2 = shufflevector <8 x i16> %v0, <8 x i16> poison, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
index 61fb457a7eb65..69822e9d9d2e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -200,12 +200,12 @@ define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vmv1r.v v7, v8
 ; RV64-NEXT:    li a2, 128
-; RV64-NEXT:    vslidedown.vi v9, v0, 1
+; RV64-NEXT:    vslidedown.vi v8, v0, 1
 ; RV64-NEXT:    vmv.x.s a3, v0
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT:    vle8.v v24, (a1)
 ; RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v8, v16, v0
 ; RV64-NEXT:    vcpop.m a4, v0
@@ -227,14 +227,14 @@ define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv1r.v v7, v8
 ; RV32-NEXT:    li a2, 128
-; RV32-NEXT:    vslidedown.vi v9, v0, 1
+; RV32-NEXT:    vslidedown.vi v8, v0, 1
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vmv.x.s a4, v0
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT:    vle8.v v24, (a1)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v6, v9, a3
-; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vsrl.vx v6, v8, a3
+; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vsrl.vx v5, v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT:    vcompress.vm v8, v16, v0
@@ -438,16 +438,16 @@ define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %da
 ; RV64-NEXT:    vcompress.vm v24, v8, v0
 ; RV64-NEXT:    vcpop.m a2, v0
 ; RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v0, 8
+; RV64-NEXT:    vslidedown.vi v7, v0, 8
 ; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v0, v16, v8
-; RV64-NEXT:    vcpop.m a1, v8
+; RV64-NEXT:    vcompress.vm v8, v16, v7
+; RV64-NEXT:    vcpop.m a1, v7
 ; RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
 ; RV64-NEXT:    vse16.v v24, (a0)
 ; RV64-NEXT:    slli a2, a2, 1
 ; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV64-NEXT:    vse16.v v0, (a0)
+; RV64-NEXT:    vse16.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v128i16:
@@ -635,16 +635,16 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data)
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vse32.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v0, 4
+; RV64-NEXT:    vslidedown.vi v24, v0, 4
 ; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    vcompress.vm v24, v16, v8
-; RV64-NEXT:    vcpop.m a2, v8
+; RV64-NEXT:    vcompress.vm v8, v16, v24
+; RV64-NEXT:    vcpop.m a2, v24
 ; RV64-NEXT:    cpopw a1, a1
 ; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT:    vse32.v v24, (a0)
+; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v64i32:
@@ -654,16 +654,16 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data)
 ; RV32-NEXT:    vcompress.vm v24, v8, v0
 ; RV32-NEXT:    vcpop.m a2, v0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v0, 4
+; RV32-NEXT:    vslidedown.vi v7, v0, 4
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v0, v16, v8
-; RV32-NEXT:    vcpop.m a1, v8
+; RV32-NEXT:    vcompress.vm v8, v16, v7
+; RV32-NEXT:    vcpop.m a1, v7
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vse32.v v24, (a0)
 ; RV32-NEXT:    slli a2, a2, 2
 ; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vse32.v v0, (a0)
+; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v64i32(<64 x i32> %data, ptr align 4 %p, <64 x i1> %mask)
@@ -796,18 +796,18 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data)
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vse64.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v0, 2
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
 ; RV64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV64-NEXT:    vmv.x.s a1, v0
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v24, v16, v8
+; RV64-NEXT:    vcompress.vm v8, v16, v24
 ; RV64-NEXT:    zext.h a1, a1
 ; RV64-NEXT:    cpopw a1, a1
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vcpop.m a1, v8
+; RV64-NEXT:    vcpop.m a1, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vse64.v v24, (a0)
+; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v32i64:
@@ -818,18 +818,18 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data)
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vse64.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v0, 2
+; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV32-NEXT:    vmv.x.s a1, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v24, v16, v8
+; RV32-NEXT:    vcompress.vm v8, v16, v24
 ; RV32-NEXT:    zext.h a1, a1
 ; RV32-NEXT:    cpop a1, a1
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vcpop.m a1, v8
+; RV32-NEXT:    vcpop.m a1, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vse64.v v24, (a0)
+; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v32i64(<32 x i64> %data, ptr align 8 %p, <32 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index 208735b18cbab..97e1a7f41b92f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -162,12 +162,12 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-LABEL: ctlz_nxv4i8:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
 ; CHECK-F-NEXT:    li a0, 134
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -176,12 +176,12 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-LABEL: ctlz_nxv4i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    vzext.vf2 v10, v8
 ; CHECK-D-NEXT:    li a0, 134
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-D-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -225,13 +225,13 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-LABEL: ctlz_nxv8i8:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
 ; CHECK-F-NEXT:    li a0, 134
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-F-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
@@ -239,13 +239,13 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D-LABEL: ctlz_nxv8i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v10, v8
+; CHECK-D-NEXT:    vzext.vf2 v12, v8
 ; CHECK-D-NEXT:    li a0, 134
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-D-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-D-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
@@ -288,13 +288,13 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-LABEL: ctlz_nxv16i8:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    vzext.vf2 v16, v8
 ; CHECK-F-NEXT:    li a0, 134
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-F-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
@@ -302,13 +302,13 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D-LABEL: ctlz_nxv16i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v12, v8
+; CHECK-D-NEXT:    vzext.vf2 v16, v8
 ; CHECK-D-NEXT:    li a0, 134
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-D-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
@@ -1375,12 +1375,12 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-NEXT:    fsrmi a1, 1
 ; CHECK-F-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v9, v10, 23
-; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    vmv.v.x v11, a0
+; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
+; CHECK-F-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v10, a0
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
@@ -1515,12 +1515,12 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-NEXT:    fsrmi a1, 1
 ; CHECK-F-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v10, v12, 23
-; CHECK-F-NEXT:    vwsubu.vv v12, v8, v10
+; CHECK-F-NEXT:    vmv.v.x v14, a0
+; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
+; CHECK-F-NEXT:    vwsubu.vv v8, v14, v12
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v12, a0
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
@@ -1655,12 +1655,12 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-NEXT:    fsrmi a1, 1
 ; CHECK-F-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v12, v16, 23
-; CHECK-F-NEXT:    vwsubu.vv v16, v8, v12
+; CHECK-F-NEXT:    vmv.v.x v20, a0
+; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-F-NEXT:    vwsubu.vv v8, v20, v16
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v16, a0
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
@@ -1832,11 +1832,11 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv4i8:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v9, v8
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
@@ -1844,11 +1844,11 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v9, v8
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-D-NEXT:    vzext.vf2 v10, v8
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-D-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
@@ -1890,25 +1890,25 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv8i8:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v10, v8
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-F-NEXT:    li a0, 134
-; CHECK-F-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v10, v8
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-D-NEXT:    vzext.vf2 v12, v8
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-D-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-D-NEXT:    li a0, 134
-; CHECK-D-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i8:
@@ -1948,25 +1948,25 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv16i8:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v12, v8
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vzext.vf2 v16, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-F-NEXT:    li a0, 134
-; CHECK-F-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v12, v8
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-D-NEXT:    vzext.vf2 v16, v8
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-D-NEXT:    li a0, 134
-; CHECK-D-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv16i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index ceca813782461..570ff34b0f23a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -120,11 +120,11 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK-LABEL: vp_ctlz_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
@@ -147,12 +147,12 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 ; CHECK-LABEL: vp_ctlz_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vzext.vf2 v10, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -173,11 +173,11 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK-LABEL: vp_ctlz_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
@@ -200,13 +200,13 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 ; CHECK-LABEL: vp_ctlz_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vzext.vf2 v12, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
@@ -226,11 +226,11 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-LABEL: vp_ctlz_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
@@ -253,13 +253,13 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK-LABEL: vp_ctlz_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vzext.vf2 v16, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
@@ -1444,11 +1444,11 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
@@ -1469,11 +1469,11 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
@@ -1492,11 +1492,11 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
@@ -1517,13 +1517,13 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i8_unmasked:
@@ -1540,11 +1540,11 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
@@ -1565,13 +1565,13 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-NEXT:    vzext.vf2 v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i8_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index bd7a20f9ef590..09aa487cb085f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -166,9 +166,9 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v10, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-F-NEXT:    vnsrl.wi v9, v12, 23
+; CHECK-F-NEXT:    vzext.vf2 v12, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v12
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
@@ -183,9 +183,9 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v10, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    vnsrl.wi v9, v12, 23
+; CHECK-D-NEXT:    vzext.vf2 v12, v9
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v12
+; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
@@ -561,10 +561,10 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    vand.vv v10, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
@@ -575,10 +575,10 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vand.vv v10, v8, v9
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-D-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
@@ -627,10 +627,10 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v10
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    vand.vv v12, v8, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
@@ -641,10 +641,10 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v10
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vand.vv v12, v8, v10
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-D-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
@@ -693,10 +693,10 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v12
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    vand.vv v16, v8, v12
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vnsrl.wi v16, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
@@ -707,10 +707,10 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v12
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vand.vv v16, v8, v12
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vnsrl.wi v16, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
@@ -1327,11 +1327,11 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v10
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v10, v8, a1
+; CHECK-F-NEXT:    vsrl.vi v10, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v8, v10, a1
 ; CHECK-F-NEXT:    li a1, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1451,11 +1451,11 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v12
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v12, v8, a1
+; CHECK-F-NEXT:    vsrl.vi v12, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v8, v12, a1
 ; CHECK-F-NEXT:    li a1, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1575,11 +1575,11 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v16
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v16, v8, a1
+; CHECK-F-NEXT:    vsrl.vi v16, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v8, v16, a1
 ; CHECK-F-NEXT:    li a1, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1757,11 +1757,11 @@ define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v9, v8
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
@@ -1772,11 +1772,11 @@ define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v9, v8
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-D-NEXT:    vzext.vf2 v10, v8
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-D-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
@@ -1817,13 +1817,13 @@ define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v10, v8
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v8, v10, a0
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i8:
@@ -1832,13 +1832,13 @@ define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v10, v8
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-D-NEXT:    vzext.vf2 v12, v8
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-D-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v8, v10, a0
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i8:
@@ -1877,13 +1877,13 @@ define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-F-NEXT:    vzext.vf2 v12, v8
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vzext.vf2 v16, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v8, v12, a0
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv16i8:
@@ -1892,13 +1892,13 @@ define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-D-NEXT:    vand.vv v8, v8, v10
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-D-NEXT:    vzext.vf2 v12, v8
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-D-NEXT:    vzext.vf2 v16, v8
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-D-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v8, v12, a0
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i8:
@@ -2120,22 +2120,22 @@ define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vand.vv v10, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-D-NEXT:    vand.vv v10, v8, v9
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-D-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i16:
@@ -2179,22 +2179,22 @@ define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v10
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vand.vv v12, v8, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v10
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-D-NEXT:    vand.vv v12, v8, v10
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-D-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i16:
@@ -2238,22 +2238,22 @@ define <vscale x 16 x i16> @cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v12
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vand.vv v16, v8, v12
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v12
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-D-NEXT:    vand.vv v16, v8, v12
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i16:
@@ -2412,11 +2412,11 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-D-NEXT:    li a0, 52
-; CHECK-D-NEXT:    vand.vv v8, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    vnsrl.wx v8, v10, a0
+; CHECK-D-NEXT:    vand.vv v10, v8, v9
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-D-NEXT:    vnsrl.wx v10, v8, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i32:
@@ -2475,11 +2475,11 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-D-NEXT:    li a0, 52
-; CHECK-D-NEXT:    vand.vv v8, v8, v10
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    vnsrl.wx v8, v12, a0
+; CHECK-D-NEXT:    vand.vv v12, v8, v10
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-D-NEXT:    vnsrl.wx v12, v8, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i32:
@@ -2538,11 +2538,11 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-D-NEXT:    li a0, 52
-; CHECK-D-NEXT:    vand.vv v8, v8, v12
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    vnsrl.wx v8, v16, a0
+; CHECK-D-NEXT:    vand.vv v16, v8, v12
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vnsrl.wx v16, v8, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 9f113d5c428ec..71136a6526c44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2727,10 +2727,10 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
+; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
@@ -2754,11 +2754,11 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
@@ -2781,10 +2781,10 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
+; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
@@ -2808,13 +2808,13 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i8_unmasked:
@@ -2835,10 +2835,10 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
+; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
@@ -2862,13 +2862,13 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-NEXT:    vzext.vf2 v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i8_unmasked:
@@ -3101,10 +3101,10 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    vand.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
@@ -3124,11 +3124,11 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i16_unmasked:
@@ -3147,10 +3147,10 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    vand.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
@@ -3170,11 +3170,11 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i16_unmasked:
@@ -3193,10 +3193,10 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    vand.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
@@ -3216,11 +3216,11 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16_unmasked(<vscale x 16 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i16_unmasked:
@@ -3362,10 +3362,10 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    vand.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
@@ -3387,11 +3387,11 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vnsrl.wx v8, v10, a0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wx v10, v8, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i32_unmasked:
@@ -3410,10 +3410,10 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    vand.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
@@ -3435,11 +3435,11 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vnsrl.wx v8, v12, a0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wx v12, v8, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i32_unmasked:
@@ -3458,10 +3458,10 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    vand.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
@@ -3483,11 +3483,11 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vnsrl.wx v8, v16, a0
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wx v16, v8, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i32_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
index 7fa5f808f71ab..25706bdec55c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -644,12 +644,12 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    li a2, 24
-; CHECK-RV32-NEXT:    mul a1, a1, a2
+; CHECK-RV32-NEXT:    slli a1, a1, 5
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 4
+; CHECK-RV32-NEXT:    li a2, 24
+; CHECK-RV32-NEXT:    mul a1, a1, a2
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -662,34 +662,45 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a4, v0
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v6, v0, a2
+; CHECK-RV32-NEXT:    vsrl.vx v16, v0, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a2, v7
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v16, (a0)
+; CHECK-RV32-NEXT:    vle16.v v24, (a0)
+; CHECK-RV32-NEXT:    csrr a5, vlenb
+; CHECK-RV32-NEXT:    slli a5, a5, 4
+; CHECK-RV32-NEXT:    add a5, sp, a5
+; CHECK-RV32-NEXT:    addi a5, a5, 16
+; CHECK-RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a4, v6
+; CHECK-RV32-NEXT:    vmv.x.s a4, v16
 ; CHECK-RV32-NEXT:    cpop a4, a4
 ; CHECK-RV32-NEXT:    cpop a3, a3
 ; CHECK-RV32-NEXT:    add a3, a3, a4
 ; CHECK-RV32-NEXT:    slli a3, a3, 1
 ; CHECK-RV32-NEXT:    add a0, a0, a3
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v24, (a0)
+; CHECK-RV32-NEXT:    vle16.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-RV32-NEXT:    viota.m v24, v0
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    li a1, 24
+; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -703,8 +714,7 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vmv.v.v v16, v24
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 5
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1806,22 +1816,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_66: # %else242
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 1
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_68
 ; CHECK-RV32-NEXT:  # %bb.67: # %cond.load245
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 63
 ; CHECK-RV32-NEXT:    li a4, 62
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_68: # %else246
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_69
 ; CHECK-RV32-NEXT:    j .LBB61_602
 ; CHECK-RV32-NEXT:  .LBB61_69: # %else250
@@ -1946,34 +1956,34 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_99: # %cond.load369
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 94
 ; CHECK-RV32-NEXT:    li a4, 93
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_100: # %else370
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_102
 ; CHECK-RV32-NEXT:  # %bb.101: # %cond.load373
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 95
 ; CHECK-RV32-NEXT:    li a4, 94
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_102: # %else374
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_103
 ; CHECK-RV32-NEXT:    j .LBB61_632
 ; CHECK-RV32-NEXT:  .LBB61_103: # %else378
@@ -2110,22 +2120,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_134: # %else498
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_136
 ; CHECK-RV32-NEXT:  # %bb.135: # %cond.load501
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 127
 ; CHECK-RV32-NEXT:    li a4, 126
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_136: # %else502
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_137
 ; CHECK-RV32-NEXT:    j .LBB61_662
 ; CHECK-RV32-NEXT:  .LBB61_137: # %else506
@@ -2250,34 +2260,34 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_167: # %cond.load625
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 158
 ; CHECK-RV32-NEXT:    li a4, 157
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_168: # %else626
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_170
 ; CHECK-RV32-NEXT:  # %bb.169: # %cond.load629
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 159
 ; CHECK-RV32-NEXT:    li a4, 158
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_170: # %else630
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_171
 ; CHECK-RV32-NEXT:    j .LBB61_692
 ; CHECK-RV32-NEXT:  .LBB61_171: # %else634
@@ -2414,22 +2424,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_202: # %else754
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 3
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 3
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_204
 ; CHECK-RV32-NEXT:  # %bb.203: # %cond.load757
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 191
 ; CHECK-RV32-NEXT:    li a4, 190
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_204: # %else758
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_205
 ; CHECK-RV32-NEXT:    j .LBB61_722
 ; CHECK-RV32-NEXT:  .LBB61_205: # %else762
@@ -2554,34 +2564,34 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_235: # %cond.load881
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 222
 ; CHECK-RV32-NEXT:    li a4, 221
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_236: # %else882
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_238
 ; CHECK-RV32-NEXT:  # %bb.237: # %cond.load885
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 223
 ; CHECK-RV32-NEXT:    li a4, 222
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_238: # %else886
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_239
 ; CHECK-RV32-NEXT:    j .LBB61_752
 ; CHECK-RV32-NEXT:  .LBB61_239: # %else890
@@ -2718,22 +2728,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_270: # %else1010
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 4
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_272
 ; CHECK-RV32-NEXT:  # %bb.271: # %cond.load1013
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 255
 ; CHECK-RV32-NEXT:    li a4, 254
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_272: # %else1014
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_273
 ; CHECK-RV32-NEXT:    j .LBB61_782
 ; CHECK-RV32-NEXT:  .LBB61_273: # %else1018
@@ -2859,28 +2869,28 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 286
 ; CHECK-RV32-NEXT:    li a4, 285
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_304: # %else1138
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_306
 ; CHECK-RV32-NEXT:  # %bb.305: # %cond.load1141
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 287
 ; CHECK-RV32-NEXT:    li a4, 286
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_306: # %else1142
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_307
 ; CHECK-RV32-NEXT:    j .LBB61_812
 ; CHECK-RV32-NEXT:  .LBB61_307: # %else1146
@@ -3015,19 +3025,19 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_338: # %else1266
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 5
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 5
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_340
 ; CHECK-RV32-NEXT:  # %bb.339: # %cond.load1269
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
+; CHECK-RV32-NEXT:    vmv.s.x v16, a3
 ; CHECK-RV32-NEXT:    li a3, 319
 ; CHECK-RV32-NEXT:    li a4, 318
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_340: # %else1270
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_341
 ; CHECK-RV32-NEXT:    j .LBB61_842
 ; CHECK-RV32-NEXT:  .LBB61_341: # %else1274
@@ -3153,28 +3163,28 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 350
 ; CHECK-RV32-NEXT:    li a4, 349
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_372: # %else1394
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_374
 ; CHECK-RV32-NEXT:  # %bb.373: # %cond.load1397
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 351
 ; CHECK-RV32-NEXT:    li a4, 350
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_374: # %else1398
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_375
 ; CHECK-RV32-NEXT:    j .LBB61_872
 ; CHECK-RV32-NEXT:  .LBB61_375: # %else1402
@@ -3309,19 +3319,19 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_406: # %else1522
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 6
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 6
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_408
 ; CHECK-RV32-NEXT:  # %bb.407: # %cond.load1525
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
+; CHECK-RV32-NEXT:    vmv.s.x v16, a3
 ; CHECK-RV32-NEXT:    li a3, 383
 ; CHECK-RV32-NEXT:    li a4, 382
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_408: # %else1526
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_409
 ; CHECK-RV32-NEXT:    j .LBB61_902
 ; CHECK-RV32-NEXT:  .LBB61_409: # %else1530
@@ -3447,28 +3457,28 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 414
 ; CHECK-RV32-NEXT:    li a4, 413
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_440: # %else1650
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_442
 ; CHECK-RV32-NEXT:  # %bb.441: # %cond.load1653
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 415
 ; CHECK-RV32-NEXT:    li a4, 414
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_442: # %else1654
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_443
 ; CHECK-RV32-NEXT:    j .LBB61_932
 ; CHECK-RV32-NEXT:  .LBB61_443: # %else1658
@@ -3603,19 +3613,19 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_474: # %else1778
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 7
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 7
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_476
 ; CHECK-RV32-NEXT:  # %bb.475: # %cond.load1781
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
+; CHECK-RV32-NEXT:    vmv.s.x v16, a3
 ; CHECK-RV32-NEXT:    li a3, 447
 ; CHECK-RV32-NEXT:    li a4, 446
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_476: # %else1782
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v24
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_477
 ; CHECK-RV32-NEXT:    j .LBB61_962
 ; CHECK-RV32-NEXT:  .LBB61_477: # %else1786
@@ -3741,28 +3751,28 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 478
 ; CHECK-RV32-NEXT:    li a4, 477
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_508: # %else1906
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_510
 ; CHECK-RV32-NEXT:  # %bb.509: # %cond.load1909
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a1
+; CHECK-RV32-NEXT:    vmv.s.x v16, a1
 ; CHECK-RV32-NEXT:    li a1, 479
 ; CHECK-RV32-NEXT:    li a2, 478
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a2
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a2
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_510: # %else1910
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a1, v16
+; CHECK-RV32-NEXT:    vmv.x.s a1, v24
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_511
 ; CHECK-RV32-NEXT:    j .LBB61_992
 ; CHECK-RV32-NEXT:  .LBB61_511: # %else1914
@@ -4683,450 +4693,450 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_66
 ; CHECK-RV32-NEXT:  .LBB61_602: # %cond.load249
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    li a2, 64
 ; CHECK-RV32-NEXT:    li a4, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_603
 ; CHECK-RV32-NEXT:    j .LBB61_70
 ; CHECK-RV32-NEXT:  .LBB61_603: # %cond.load253
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 65
 ; CHECK-RV32-NEXT:    li a4, 64
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_604
 ; CHECK-RV32-NEXT:    j .LBB61_71
 ; CHECK-RV32-NEXT:  .LBB61_604: # %cond.load257
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 66
 ; CHECK-RV32-NEXT:    li a4, 65
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_605
 ; CHECK-RV32-NEXT:    j .LBB61_72
 ; CHECK-RV32-NEXT:  .LBB61_605: # %cond.load261
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 67
 ; CHECK-RV32-NEXT:    li a4, 66
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_606
 ; CHECK-RV32-NEXT:    j .LBB61_73
 ; CHECK-RV32-NEXT:  .LBB61_606: # %cond.load265
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 68
 ; CHECK-RV32-NEXT:    li a4, 67
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_607
 ; CHECK-RV32-NEXT:    j .LBB61_74
 ; CHECK-RV32-NEXT:  .LBB61_607: # %cond.load269
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 69
 ; CHECK-RV32-NEXT:    li a4, 68
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_608
 ; CHECK-RV32-NEXT:    j .LBB61_75
 ; CHECK-RV32-NEXT:  .LBB61_608: # %cond.load273
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 70
 ; CHECK-RV32-NEXT:    li a4, 69
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_609
 ; CHECK-RV32-NEXT:    j .LBB61_76
 ; CHECK-RV32-NEXT:  .LBB61_609: # %cond.load277
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 71
 ; CHECK-RV32-NEXT:    li a4, 70
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_610
 ; CHECK-RV32-NEXT:    j .LBB61_77
 ; CHECK-RV32-NEXT:  .LBB61_610: # %cond.load281
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 72
 ; CHECK-RV32-NEXT:    li a4, 71
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_611
 ; CHECK-RV32-NEXT:    j .LBB61_78
 ; CHECK-RV32-NEXT:  .LBB61_611: # %cond.load285
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 73
 ; CHECK-RV32-NEXT:    li a4, 72
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_612
 ; CHECK-RV32-NEXT:    j .LBB61_79
 ; CHECK-RV32-NEXT:  .LBB61_612: # %cond.load289
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 74
 ; CHECK-RV32-NEXT:    li a4, 73
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_613
 ; CHECK-RV32-NEXT:    j .LBB61_80
 ; CHECK-RV32-NEXT:  .LBB61_613: # %cond.load293
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 75
 ; CHECK-RV32-NEXT:    li a4, 74
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_614
 ; CHECK-RV32-NEXT:    j .LBB61_81
 ; CHECK-RV32-NEXT:  .LBB61_614: # %cond.load297
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 76
 ; CHECK-RV32-NEXT:    li a4, 75
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_615
 ; CHECK-RV32-NEXT:    j .LBB61_82
 ; CHECK-RV32-NEXT:  .LBB61_615: # %cond.load301
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 77
 ; CHECK-RV32-NEXT:    li a4, 76
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_616
 ; CHECK-RV32-NEXT:    j .LBB61_83
 ; CHECK-RV32-NEXT:  .LBB61_616: # %cond.load305
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 78
 ; CHECK-RV32-NEXT:    li a4, 77
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_617
 ; CHECK-RV32-NEXT:    j .LBB61_84
 ; CHECK-RV32-NEXT:  .LBB61_617: # %cond.load309
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 79
 ; CHECK-RV32-NEXT:    li a4, 78
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_618
 ; CHECK-RV32-NEXT:    j .LBB61_85
 ; CHECK-RV32-NEXT:  .LBB61_618: # %cond.load313
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 80
 ; CHECK-RV32-NEXT:    li a4, 79
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_619
 ; CHECK-RV32-NEXT:    j .LBB61_86
 ; CHECK-RV32-NEXT:  .LBB61_619: # %cond.load317
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 81
 ; CHECK-RV32-NEXT:    li a4, 80
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_620
 ; CHECK-RV32-NEXT:    j .LBB61_87
 ; CHECK-RV32-NEXT:  .LBB61_620: # %cond.load321
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 82
 ; CHECK-RV32-NEXT:    li a4, 81
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_621
 ; CHECK-RV32-NEXT:    j .LBB61_88
 ; CHECK-RV32-NEXT:  .LBB61_621: # %cond.load325
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 83
 ; CHECK-RV32-NEXT:    li a4, 82
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_622
 ; CHECK-RV32-NEXT:    j .LBB61_89
 ; CHECK-RV32-NEXT:  .LBB61_622: # %cond.load329
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 84
 ; CHECK-RV32-NEXT:    li a4, 83
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_623
 ; CHECK-RV32-NEXT:    j .LBB61_90
 ; CHECK-RV32-NEXT:  .LBB61_623: # %cond.load333
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 85
 ; CHECK-RV32-NEXT:    li a4, 84
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_624
 ; CHECK-RV32-NEXT:    j .LBB61_91
 ; CHECK-RV32-NEXT:  .LBB61_624: # %cond.load337
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 86
 ; CHECK-RV32-NEXT:    li a4, 85
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_625
 ; CHECK-RV32-NEXT:    j .LBB61_92
 ; CHECK-RV32-NEXT:  .LBB61_625: # %cond.load341
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 87
 ; CHECK-RV32-NEXT:    li a4, 86
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_626
 ; CHECK-RV32-NEXT:    j .LBB61_93
 ; CHECK-RV32-NEXT:  .LBB61_626: # %cond.load345
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 88
 ; CHECK-RV32-NEXT:    li a4, 87
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_627
 ; CHECK-RV32-NEXT:    j .LBB61_94
 ; CHECK-RV32-NEXT:  .LBB61_627: # %cond.load349
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 89
 ; CHECK-RV32-NEXT:    li a4, 88
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_628
 ; CHECK-RV32-NEXT:    j .LBB61_95
 ; CHECK-RV32-NEXT:  .LBB61_628: # %cond.load353
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 90
 ; CHECK-RV32-NEXT:    li a4, 89
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_629
 ; CHECK-RV32-NEXT:    j .LBB61_96
 ; CHECK-RV32-NEXT:  .LBB61_629: # %cond.load357
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 91
 ; CHECK-RV32-NEXT:    li a4, 90
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_630
 ; CHECK-RV32-NEXT:    j .LBB61_97
 ; CHECK-RV32-NEXT:  .LBB61_630: # %cond.load361
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 92
 ; CHECK-RV32-NEXT:    li a4, 91
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_631
 ; CHECK-RV32-NEXT:    j .LBB61_98
 ; CHECK-RV32-NEXT:  .LBB61_631: # %cond.load365
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 93
 ; CHECK-RV32-NEXT:    li a4, 92
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1027
 ; CHECK-RV32-NEXT:    j .LBB61_99
@@ -5585,450 +5595,450 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_134
 ; CHECK-RV32-NEXT:  .LBB61_662: # %cond.load505
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 128
 ; CHECK-RV32-NEXT:    li a4, 127
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv2r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
-; CHECK-RV32-NEXT:    andi a2, a3, 1
+; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
+; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_663
 ; CHECK-RV32-NEXT:    j .LBB61_138
 ; CHECK-RV32-NEXT:  .LBB61_663: # %cond.load509
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 129
 ; CHECK-RV32-NEXT:    li a4, 128
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_664
 ; CHECK-RV32-NEXT:    j .LBB61_139
 ; CHECK-RV32-NEXT:  .LBB61_664: # %cond.load513
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 130
 ; CHECK-RV32-NEXT:    li a4, 129
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_665
 ; CHECK-RV32-NEXT:    j .LBB61_140
 ; CHECK-RV32-NEXT:  .LBB61_665: # %cond.load517
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 131
 ; CHECK-RV32-NEXT:    li a4, 130
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_666
 ; CHECK-RV32-NEXT:    j .LBB61_141
 ; CHECK-RV32-NEXT:  .LBB61_666: # %cond.load521
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 132
 ; CHECK-RV32-NEXT:    li a4, 131
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_667
 ; CHECK-RV32-NEXT:    j .LBB61_142
 ; CHECK-RV32-NEXT:  .LBB61_667: # %cond.load525
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 133
 ; CHECK-RV32-NEXT:    li a4, 132
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_668
 ; CHECK-RV32-NEXT:    j .LBB61_143
 ; CHECK-RV32-NEXT:  .LBB61_668: # %cond.load529
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 134
 ; CHECK-RV32-NEXT:    li a4, 133
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_669
 ; CHECK-RV32-NEXT:    j .LBB61_144
 ; CHECK-RV32-NEXT:  .LBB61_669: # %cond.load533
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 135
 ; CHECK-RV32-NEXT:    li a4, 134
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_670
 ; CHECK-RV32-NEXT:    j .LBB61_145
 ; CHECK-RV32-NEXT:  .LBB61_670: # %cond.load537
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 136
 ; CHECK-RV32-NEXT:    li a4, 135
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_671
 ; CHECK-RV32-NEXT:    j .LBB61_146
 ; CHECK-RV32-NEXT:  .LBB61_671: # %cond.load541
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 137
 ; CHECK-RV32-NEXT:    li a4, 136
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_672
 ; CHECK-RV32-NEXT:    j .LBB61_147
 ; CHECK-RV32-NEXT:  .LBB61_672: # %cond.load545
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 138
 ; CHECK-RV32-NEXT:    li a4, 137
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_673
 ; CHECK-RV32-NEXT:    j .LBB61_148
 ; CHECK-RV32-NEXT:  .LBB61_673: # %cond.load549
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 139
 ; CHECK-RV32-NEXT:    li a4, 138
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_674
 ; CHECK-RV32-NEXT:    j .LBB61_149
 ; CHECK-RV32-NEXT:  .LBB61_674: # %cond.load553
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 140
 ; CHECK-RV32-NEXT:    li a4, 139
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_675
 ; CHECK-RV32-NEXT:    j .LBB61_150
 ; CHECK-RV32-NEXT:  .LBB61_675: # %cond.load557
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 141
 ; CHECK-RV32-NEXT:    li a4, 140
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_676
 ; CHECK-RV32-NEXT:    j .LBB61_151
 ; CHECK-RV32-NEXT:  .LBB61_676: # %cond.load561
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 142
 ; CHECK-RV32-NEXT:    li a4, 141
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_677
 ; CHECK-RV32-NEXT:    j .LBB61_152
 ; CHECK-RV32-NEXT:  .LBB61_677: # %cond.load565
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 143
 ; CHECK-RV32-NEXT:    li a4, 142
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_678
 ; CHECK-RV32-NEXT:    j .LBB61_153
 ; CHECK-RV32-NEXT:  .LBB61_678: # %cond.load569
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 144
 ; CHECK-RV32-NEXT:    li a4, 143
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_679
 ; CHECK-RV32-NEXT:    j .LBB61_154
 ; CHECK-RV32-NEXT:  .LBB61_679: # %cond.load573
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 145
 ; CHECK-RV32-NEXT:    li a4, 144
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_680
 ; CHECK-RV32-NEXT:    j .LBB61_155
 ; CHECK-RV32-NEXT:  .LBB61_680: # %cond.load577
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 146
 ; CHECK-RV32-NEXT:    li a4, 145
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_681
 ; CHECK-RV32-NEXT:    j .LBB61_156
 ; CHECK-RV32-NEXT:  .LBB61_681: # %cond.load581
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 147
 ; CHECK-RV32-NEXT:    li a4, 146
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_682
 ; CHECK-RV32-NEXT:    j .LBB61_157
 ; CHECK-RV32-NEXT:  .LBB61_682: # %cond.load585
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 148
 ; CHECK-RV32-NEXT:    li a4, 147
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_683
 ; CHECK-RV32-NEXT:    j .LBB61_158
 ; CHECK-RV32-NEXT:  .LBB61_683: # %cond.load589
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 149
 ; CHECK-RV32-NEXT:    li a4, 148
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_684
 ; CHECK-RV32-NEXT:    j .LBB61_159
 ; CHECK-RV32-NEXT:  .LBB61_684: # %cond.load593
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 150
 ; CHECK-RV32-NEXT:    li a4, 149
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_685
 ; CHECK-RV32-NEXT:    j .LBB61_160
 ; CHECK-RV32-NEXT:  .LBB61_685: # %cond.load597
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 151
 ; CHECK-RV32-NEXT:    li a4, 150
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_686
 ; CHECK-RV32-NEXT:    j .LBB61_161
 ; CHECK-RV32-NEXT:  .LBB61_686: # %cond.load601
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 152
 ; CHECK-RV32-NEXT:    li a4, 151
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_687
 ; CHECK-RV32-NEXT:    j .LBB61_162
 ; CHECK-RV32-NEXT:  .LBB61_687: # %cond.load605
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 153
 ; CHECK-RV32-NEXT:    li a4, 152
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_688
 ; CHECK-RV32-NEXT:    j .LBB61_163
 ; CHECK-RV32-NEXT:  .LBB61_688: # %cond.load609
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 154
 ; CHECK-RV32-NEXT:    li a4, 153
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_689
 ; CHECK-RV32-NEXT:    j .LBB61_164
 ; CHECK-RV32-NEXT:  .LBB61_689: # %cond.load613
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 155
 ; CHECK-RV32-NEXT:    li a4, 154
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_690
 ; CHECK-RV32-NEXT:    j .LBB61_165
 ; CHECK-RV32-NEXT:  .LBB61_690: # %cond.load617
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 156
 ; CHECK-RV32-NEXT:    li a4, 155
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_691
 ; CHECK-RV32-NEXT:    j .LBB61_166
 ; CHECK-RV32-NEXT:  .LBB61_691: # %cond.load621
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 157
 ; CHECK-RV32-NEXT:    li a4, 156
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1029
 ; CHECK-RV32-NEXT:    j .LBB61_167
@@ -6487,450 +6497,450 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_202
 ; CHECK-RV32-NEXT:  .LBB61_722: # %cond.load761
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 192
 ; CHECK-RV32-NEXT:    li a4, 191
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_723
 ; CHECK-RV32-NEXT:    j .LBB61_206
 ; CHECK-RV32-NEXT:  .LBB61_723: # %cond.load765
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 193
 ; CHECK-RV32-NEXT:    li a4, 192
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_724
 ; CHECK-RV32-NEXT:    j .LBB61_207
 ; CHECK-RV32-NEXT:  .LBB61_724: # %cond.load769
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 194
 ; CHECK-RV32-NEXT:    li a4, 193
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_725
 ; CHECK-RV32-NEXT:    j .LBB61_208
 ; CHECK-RV32-NEXT:  .LBB61_725: # %cond.load773
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 195
 ; CHECK-RV32-NEXT:    li a4, 194
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_726
 ; CHECK-RV32-NEXT:    j .LBB61_209
 ; CHECK-RV32-NEXT:  .LBB61_726: # %cond.load777
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 196
 ; CHECK-RV32-NEXT:    li a4, 195
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_727
 ; CHECK-RV32-NEXT:    j .LBB61_210
 ; CHECK-RV32-NEXT:  .LBB61_727: # %cond.load781
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 197
 ; CHECK-RV32-NEXT:    li a4, 196
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_728
 ; CHECK-RV32-NEXT:    j .LBB61_211
 ; CHECK-RV32-NEXT:  .LBB61_728: # %cond.load785
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 198
 ; CHECK-RV32-NEXT:    li a4, 197
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_729
 ; CHECK-RV32-NEXT:    j .LBB61_212
 ; CHECK-RV32-NEXT:  .LBB61_729: # %cond.load789
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 199
 ; CHECK-RV32-NEXT:    li a4, 198
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_730
 ; CHECK-RV32-NEXT:    j .LBB61_213
 ; CHECK-RV32-NEXT:  .LBB61_730: # %cond.load793
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 200
 ; CHECK-RV32-NEXT:    li a4, 199
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_731
 ; CHECK-RV32-NEXT:    j .LBB61_214
 ; CHECK-RV32-NEXT:  .LBB61_731: # %cond.load797
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 201
 ; CHECK-RV32-NEXT:    li a4, 200
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_732
 ; CHECK-RV32-NEXT:    j .LBB61_215
 ; CHECK-RV32-NEXT:  .LBB61_732: # %cond.load801
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 202
 ; CHECK-RV32-NEXT:    li a4, 201
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_733
 ; CHECK-RV32-NEXT:    j .LBB61_216
 ; CHECK-RV32-NEXT:  .LBB61_733: # %cond.load805
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 203
 ; CHECK-RV32-NEXT:    li a4, 202
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_734
 ; CHECK-RV32-NEXT:    j .LBB61_217
 ; CHECK-RV32-NEXT:  .LBB61_734: # %cond.load809
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 204
 ; CHECK-RV32-NEXT:    li a4, 203
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_735
 ; CHECK-RV32-NEXT:    j .LBB61_218
 ; CHECK-RV32-NEXT:  .LBB61_735: # %cond.load813
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 205
 ; CHECK-RV32-NEXT:    li a4, 204
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_736
 ; CHECK-RV32-NEXT:    j .LBB61_219
 ; CHECK-RV32-NEXT:  .LBB61_736: # %cond.load817
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 206
 ; CHECK-RV32-NEXT:    li a4, 205
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_737
 ; CHECK-RV32-NEXT:    j .LBB61_220
 ; CHECK-RV32-NEXT:  .LBB61_737: # %cond.load821
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 207
 ; CHECK-RV32-NEXT:    li a4, 206
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_738
 ; CHECK-RV32-NEXT:    j .LBB61_221
 ; CHECK-RV32-NEXT:  .LBB61_738: # %cond.load825
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 208
 ; CHECK-RV32-NEXT:    li a4, 207
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_739
 ; CHECK-RV32-NEXT:    j .LBB61_222
 ; CHECK-RV32-NEXT:  .LBB61_739: # %cond.load829
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 209
 ; CHECK-RV32-NEXT:    li a4, 208
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_740
 ; CHECK-RV32-NEXT:    j .LBB61_223
 ; CHECK-RV32-NEXT:  .LBB61_740: # %cond.load833
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 210
 ; CHECK-RV32-NEXT:    li a4, 209
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_741
 ; CHECK-RV32-NEXT:    j .LBB61_224
 ; CHECK-RV32-NEXT:  .LBB61_741: # %cond.load837
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 211
 ; CHECK-RV32-NEXT:    li a4, 210
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_742
 ; CHECK-RV32-NEXT:    j .LBB61_225
 ; CHECK-RV32-NEXT:  .LBB61_742: # %cond.load841
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 212
 ; CHECK-RV32-NEXT:    li a4, 211
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_743
 ; CHECK-RV32-NEXT:    j .LBB61_226
 ; CHECK-RV32-NEXT:  .LBB61_743: # %cond.load845
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 213
 ; CHECK-RV32-NEXT:    li a4, 212
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_744
 ; CHECK-RV32-NEXT:    j .LBB61_227
 ; CHECK-RV32-NEXT:  .LBB61_744: # %cond.load849
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 214
 ; CHECK-RV32-NEXT:    li a4, 213
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_745
 ; CHECK-RV32-NEXT:    j .LBB61_228
 ; CHECK-RV32-NEXT:  .LBB61_745: # %cond.load853
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 215
 ; CHECK-RV32-NEXT:    li a4, 214
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_746
 ; CHECK-RV32-NEXT:    j .LBB61_229
 ; CHECK-RV32-NEXT:  .LBB61_746: # %cond.load857
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 216
 ; CHECK-RV32-NEXT:    li a4, 215
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_747
 ; CHECK-RV32-NEXT:    j .LBB61_230
 ; CHECK-RV32-NEXT:  .LBB61_747: # %cond.load861
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 217
 ; CHECK-RV32-NEXT:    li a4, 216
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_748
 ; CHECK-RV32-NEXT:    j .LBB61_231
 ; CHECK-RV32-NEXT:  .LBB61_748: # %cond.load865
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 218
 ; CHECK-RV32-NEXT:    li a4, 217
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_749
 ; CHECK-RV32-NEXT:    j .LBB61_232
 ; CHECK-RV32-NEXT:  .LBB61_749: # %cond.load869
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 219
 ; CHECK-RV32-NEXT:    li a4, 218
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_750
 ; CHECK-RV32-NEXT:    j .LBB61_233
 ; CHECK-RV32-NEXT:  .LBB61_750: # %cond.load873
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 220
 ; CHECK-RV32-NEXT:    li a4, 219
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_751
 ; CHECK-RV32-NEXT:    j .LBB61_234
 ; CHECK-RV32-NEXT:  .LBB61_751: # %cond.load877
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 221
 ; CHECK-RV32-NEXT:    li a4, 220
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1031
 ; CHECK-RV32-NEXT:    j .LBB61_235
@@ -7389,15 +7399,15 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_270
 ; CHECK-RV32-NEXT:  .LBB61_782: # %cond.load1017
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 256
 ; CHECK-RV32-NEXT:    li a4, 255
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_783
 ; CHECK-RV32-NEXT:    j .LBB61_274
@@ -7405,11 +7415,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 257
 ; CHECK-RV32-NEXT:    li a4, 256
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_784
@@ -7418,11 +7428,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 258
 ; CHECK-RV32-NEXT:    li a4, 257
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_785
@@ -7431,11 +7441,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 259
 ; CHECK-RV32-NEXT:    li a4, 258
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_786
@@ -7444,11 +7454,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 260
 ; CHECK-RV32-NEXT:    li a4, 259
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_787
@@ -7457,11 +7467,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 261
 ; CHECK-RV32-NEXT:    li a4, 260
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_788
@@ -7470,11 +7480,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 262
 ; CHECK-RV32-NEXT:    li a4, 261
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_789
@@ -7483,11 +7493,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 263
 ; CHECK-RV32-NEXT:    li a4, 262
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_790
@@ -7496,11 +7506,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 264
 ; CHECK-RV32-NEXT:    li a4, 263
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_791
@@ -7509,11 +7519,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 265
 ; CHECK-RV32-NEXT:    li a4, 264
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_792
@@ -7522,11 +7532,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 266
 ; CHECK-RV32-NEXT:    li a4, 265
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_793
@@ -7535,11 +7545,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 267
 ; CHECK-RV32-NEXT:    li a4, 266
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_794
@@ -7548,11 +7558,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 268
 ; CHECK-RV32-NEXT:    li a4, 267
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_795
@@ -7561,11 +7571,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 269
 ; CHECK-RV32-NEXT:    li a4, 268
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_796
@@ -7574,11 +7584,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 270
 ; CHECK-RV32-NEXT:    li a4, 269
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_797
@@ -7587,11 +7597,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 271
 ; CHECK-RV32-NEXT:    li a4, 270
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_798
@@ -7600,11 +7610,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 272
 ; CHECK-RV32-NEXT:    li a4, 271
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_799
@@ -7613,11 +7623,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 273
 ; CHECK-RV32-NEXT:    li a4, 272
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_800
@@ -7626,11 +7636,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 274
 ; CHECK-RV32-NEXT:    li a4, 273
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_801
@@ -7639,11 +7649,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 275
 ; CHECK-RV32-NEXT:    li a4, 274
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_802
@@ -7652,11 +7662,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 276
 ; CHECK-RV32-NEXT:    li a4, 275
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_803
@@ -7665,11 +7675,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 277
 ; CHECK-RV32-NEXT:    li a4, 276
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_804
@@ -7678,11 +7688,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 278
 ; CHECK-RV32-NEXT:    li a4, 277
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_805
@@ -7691,11 +7701,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 279
 ; CHECK-RV32-NEXT:    li a4, 278
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_806
@@ -7704,11 +7714,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 280
 ; CHECK-RV32-NEXT:    li a4, 279
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_807
@@ -7717,11 +7727,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 281
 ; CHECK-RV32-NEXT:    li a4, 280
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_808
@@ -7730,11 +7740,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 282
 ; CHECK-RV32-NEXT:    li a4, 281
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_809
@@ -7743,11 +7753,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 283
 ; CHECK-RV32-NEXT:    li a4, 282
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_810
@@ -7756,11 +7766,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 284
 ; CHECK-RV32-NEXT:    li a4, 283
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_811
@@ -7769,11 +7779,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 285
 ; CHECK-RV32-NEXT:    li a4, 284
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1033
@@ -8172,11 +8182,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_338
 ; CHECK-RV32-NEXT:  .LBB61_842: # %cond.load1273
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 320
 ; CHECK-RV32-NEXT:    li a4, 319
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_843
@@ -8185,11 +8195,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 321
 ; CHECK-RV32-NEXT:    li a4, 320
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_844
@@ -8198,11 +8208,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 322
 ; CHECK-RV32-NEXT:    li a4, 321
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_845
@@ -8211,11 +8221,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 323
 ; CHECK-RV32-NEXT:    li a4, 322
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_846
@@ -8224,11 +8234,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 324
 ; CHECK-RV32-NEXT:    li a4, 323
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_847
@@ -8237,11 +8247,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 325
 ; CHECK-RV32-NEXT:    li a4, 324
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_848
@@ -8250,11 +8260,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 326
 ; CHECK-RV32-NEXT:    li a4, 325
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_849
@@ -8263,11 +8273,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 327
 ; CHECK-RV32-NEXT:    li a4, 326
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_850
@@ -8276,11 +8286,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 328
 ; CHECK-RV32-NEXT:    li a4, 327
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_851
@@ -8289,11 +8299,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 329
 ; CHECK-RV32-NEXT:    li a4, 328
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_852
@@ -8302,11 +8312,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 330
 ; CHECK-RV32-NEXT:    li a4, 329
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_853
@@ -8315,11 +8325,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 331
 ; CHECK-RV32-NEXT:    li a4, 330
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_854
@@ -8328,11 +8338,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 332
 ; CHECK-RV32-NEXT:    li a4, 331
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_855
@@ -8341,11 +8351,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 333
 ; CHECK-RV32-NEXT:    li a4, 332
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_856
@@ -8354,11 +8364,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 334
 ; CHECK-RV32-NEXT:    li a4, 333
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_857
@@ -8367,11 +8377,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 335
 ; CHECK-RV32-NEXT:    li a4, 334
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_858
@@ -8380,11 +8390,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 336
 ; CHECK-RV32-NEXT:    li a4, 335
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_859
@@ -8393,11 +8403,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 337
 ; CHECK-RV32-NEXT:    li a4, 336
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_860
@@ -8406,11 +8416,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 338
 ; CHECK-RV32-NEXT:    li a4, 337
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_861
@@ -8419,11 +8429,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 339
 ; CHECK-RV32-NEXT:    li a4, 338
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_862
@@ -8432,11 +8442,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 340
 ; CHECK-RV32-NEXT:    li a4, 339
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_863
@@ -8445,11 +8455,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 341
 ; CHECK-RV32-NEXT:    li a4, 340
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_864
@@ -8458,11 +8468,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 342
 ; CHECK-RV32-NEXT:    li a4, 341
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_865
@@ -8471,11 +8481,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 343
 ; CHECK-RV32-NEXT:    li a4, 342
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_866
@@ -8484,11 +8494,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 344
 ; CHECK-RV32-NEXT:    li a4, 343
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_867
@@ -8497,11 +8507,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 345
 ; CHECK-RV32-NEXT:    li a4, 344
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_868
@@ -8510,11 +8520,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 346
 ; CHECK-RV32-NEXT:    li a4, 345
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_869
@@ -8523,11 +8533,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 347
 ; CHECK-RV32-NEXT:    li a4, 346
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_870
@@ -8536,11 +8546,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 348
 ; CHECK-RV32-NEXT:    li a4, 347
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_871
@@ -8549,11 +8559,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 349
 ; CHECK-RV32-NEXT:    li a4, 348
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1035
@@ -8952,11 +8962,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_406
 ; CHECK-RV32-NEXT:  .LBB61_902: # %cond.load1529
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 384
 ; CHECK-RV32-NEXT:    li a4, 383
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_903
@@ -8965,11 +8975,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 385
 ; CHECK-RV32-NEXT:    li a4, 384
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_904
@@ -8978,11 +8988,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 386
 ; CHECK-RV32-NEXT:    li a4, 385
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_905
@@ -8991,11 +9001,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 387
 ; CHECK-RV32-NEXT:    li a4, 386
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_906
@@ -9004,11 +9014,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 388
 ; CHECK-RV32-NEXT:    li a4, 387
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_907
@@ -9017,11 +9027,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 389
 ; CHECK-RV32-NEXT:    li a4, 388
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_908
@@ -9030,11 +9040,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 390
 ; CHECK-RV32-NEXT:    li a4, 389
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_909
@@ -9043,11 +9053,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 391
 ; CHECK-RV32-NEXT:    li a4, 390
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_910
@@ -9056,11 +9066,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 392
 ; CHECK-RV32-NEXT:    li a4, 391
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_911
@@ -9069,11 +9079,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 393
 ; CHECK-RV32-NEXT:    li a4, 392
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_912
@@ -9082,11 +9092,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 394
 ; CHECK-RV32-NEXT:    li a4, 393
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_913
@@ -9095,11 +9105,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 395
 ; CHECK-RV32-NEXT:    li a4, 394
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_914
@@ -9108,11 +9118,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 396
 ; CHECK-RV32-NEXT:    li a4, 395
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_915
@@ -9121,11 +9131,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 397
 ; CHECK-RV32-NEXT:    li a4, 396
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_916
@@ -9134,11 +9144,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 398
 ; CHECK-RV32-NEXT:    li a4, 397
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_917
@@ -9147,11 +9157,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 399
 ; CHECK-RV32-NEXT:    li a4, 398
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_918
@@ -9160,11 +9170,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 400
 ; CHECK-RV32-NEXT:    li a4, 399
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_919
@@ -9173,11 +9183,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 401
 ; CHECK-RV32-NEXT:    li a4, 400
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_920
@@ -9186,11 +9196,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 402
 ; CHECK-RV32-NEXT:    li a4, 401
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_921
@@ -9199,11 +9209,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 403
 ; CHECK-RV32-NEXT:    li a4, 402
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_922
@@ -9212,11 +9222,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 404
 ; CHECK-RV32-NEXT:    li a4, 403
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_923
@@ -9225,11 +9235,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 405
 ; CHECK-RV32-NEXT:    li a4, 404
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_924
@@ -9238,11 +9248,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 406
 ; CHECK-RV32-NEXT:    li a4, 405
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_925
@@ -9251,11 +9261,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 407
 ; CHECK-RV32-NEXT:    li a4, 406
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_926
@@ -9264,11 +9274,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 408
 ; CHECK-RV32-NEXT:    li a4, 407
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_927
@@ -9277,11 +9287,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 409
 ; CHECK-RV32-NEXT:    li a4, 408
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_928
@@ -9290,11 +9300,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 410
 ; CHECK-RV32-NEXT:    li a4, 409
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_929
@@ -9303,11 +9313,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 411
 ; CHECK-RV32-NEXT:    li a4, 410
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_930
@@ -9316,11 +9326,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 412
 ; CHECK-RV32-NEXT:    li a4, 411
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_931
@@ -9329,11 +9339,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 413
 ; CHECK-RV32-NEXT:    li a4, 412
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1037
@@ -9732,11 +9742,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_474
 ; CHECK-RV32-NEXT:  .LBB61_962: # %cond.load1785
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 448
 ; CHECK-RV32-NEXT:    li a4, 447
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_963
@@ -9745,11 +9755,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 449
 ; CHECK-RV32-NEXT:    li a4, 448
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_964
@@ -9758,11 +9768,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 450
 ; CHECK-RV32-NEXT:    li a4, 449
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_965
@@ -9771,11 +9781,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 451
 ; CHECK-RV32-NEXT:    li a4, 450
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_966
@@ -9784,11 +9794,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 452
 ; CHECK-RV32-NEXT:    li a4, 451
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_967
@@ -9797,11 +9807,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 453
 ; CHECK-RV32-NEXT:    li a4, 452
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_968
@@ -9810,11 +9820,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 454
 ; CHECK-RV32-NEXT:    li a4, 453
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_969
@@ -9823,11 +9833,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 455
 ; CHECK-RV32-NEXT:    li a4, 454
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_970
@@ -9836,11 +9846,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 456
 ; CHECK-RV32-NEXT:    li a4, 455
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_971
@@ -9849,11 +9859,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 457
 ; CHECK-RV32-NEXT:    li a4, 456
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_972
@@ -9862,11 +9872,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 458
 ; CHECK-RV32-NEXT:    li a4, 457
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_973
@@ -9875,11 +9885,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 459
 ; CHECK-RV32-NEXT:    li a4, 458
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_974
@@ -9888,11 +9898,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 460
 ; CHECK-RV32-NEXT:    li a4, 459
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_975
@@ -9901,11 +9911,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 461
 ; CHECK-RV32-NEXT:    li a4, 460
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_976
@@ -9914,11 +9924,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 462
 ; CHECK-RV32-NEXT:    li a4, 461
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_977
@@ -9927,11 +9937,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 463
 ; CHECK-RV32-NEXT:    li a4, 462
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_978
@@ -9940,11 +9950,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 464
 ; CHECK-RV32-NEXT:    li a4, 463
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_979
@@ -9953,11 +9963,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 465
 ; CHECK-RV32-NEXT:    li a4, 464
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_980
@@ -9966,11 +9976,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 466
 ; CHECK-RV32-NEXT:    li a4, 465
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_981
@@ -9979,11 +9989,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 467
 ; CHECK-RV32-NEXT:    li a4, 466
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_982
@@ -9992,11 +10002,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 468
 ; CHECK-RV32-NEXT:    li a4, 467
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_983
@@ -10005,11 +10015,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 469
 ; CHECK-RV32-NEXT:    li a4, 468
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_984
@@ -10018,11 +10028,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 470
 ; CHECK-RV32-NEXT:    li a4, 469
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_985
@@ -10031,11 +10041,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 471
 ; CHECK-RV32-NEXT:    li a4, 470
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_986
@@ -10044,11 +10054,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 472
 ; CHECK-RV32-NEXT:    li a4, 471
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_987
@@ -10057,11 +10067,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 473
 ; CHECK-RV32-NEXT:    li a4, 472
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_988
@@ -10070,11 +10080,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 474
 ; CHECK-RV32-NEXT:    li a4, 473
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_989
@@ -10083,11 +10093,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 475
 ; CHECK-RV32-NEXT:    li a4, 474
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_990
@@ -10096,11 +10106,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 476
 ; CHECK-RV32-NEXT:    li a4, 475
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_991
@@ -10109,11 +10119,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a2
+; CHECK-RV32-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV32-NEXT:    li a2, 477
 ; CHECK-RV32-NEXT:    li a4, 476
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1039
@@ -10807,22 +10817,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_63: # %else242
 ; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 1
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 1
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_65
 ; CHECK-RV64-NEXT:  # %bb.64: # %cond.load245
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 63
 ; CHECK-RV64-NEXT:    li a3, 62
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_65: # %else246
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a1, v16
+; CHECK-RV64-NEXT:    vmv.x.s a1, v24
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_66
 ; CHECK-RV64-NEXT:    j .LBB61_588
 ; CHECK-RV64-NEXT:  .LBB61_66: # %else250
@@ -11087,22 +11097,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_129: # %else498
 ; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_131
 ; CHECK-RV64-NEXT:  # %bb.130: # %cond.load501
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 127
 ; CHECK-RV64-NEXT:    li a3, 126
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv2r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_131: # %else502
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a2, v16
+; CHECK-RV64-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_132
 ; CHECK-RV64-NEXT:    j .LBB61_650
 ; CHECK-RV64-NEXT:  .LBB61_132: # %else506
@@ -11367,22 +11377,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_195: # %else754
 ; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 3
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 3
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_197
 ; CHECK-RV64-NEXT:  # %bb.196: # %cond.load757
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 191
 ; CHECK-RV64-NEXT:    li a3, 190
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_197: # %else758
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a1, v16
+; CHECK-RV64-NEXT:    vmv.x.s a1, v24
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_198
 ; CHECK-RV64-NEXT:    j .LBB61_712
 ; CHECK-RV64-NEXT:  .LBB61_198: # %else762
@@ -11647,22 +11657,22 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_261: # %else1010
 ; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 4
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_263
 ; CHECK-RV64-NEXT:  # %bb.262: # %cond.load1013
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 255
 ; CHECK-RV64-NEXT:    li a3, 254
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_263: # %else1014
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a2, v16
+; CHECK-RV64-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_264
 ; CHECK-RV64-NEXT:    j .LBB61_774
 ; CHECK-RV64-NEXT:  .LBB61_264: # %else1018
@@ -11925,19 +11935,19 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_327: # %else1266
 ; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 5
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 5
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_329
 ; CHECK-RV64-NEXT:  # %bb.328: # %cond.load1269
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
+; CHECK-RV64-NEXT:    vmv.s.x v16, a1
 ; CHECK-RV64-NEXT:    li a1, 319
 ; CHECK-RV64-NEXT:    li a3, 318
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:  .LBB61_329: # %else1270
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a1, v16
+; CHECK-RV64-NEXT:    vmv.x.s a1, v24
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_330
 ; CHECK-RV64-NEXT:    j .LBB61_836
 ; CHECK-RV64-NEXT:  .LBB61_330: # %else1274
@@ -12200,19 +12210,19 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_393: # %else1522
 ; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 6
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 6
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_395
 ; CHECK-RV64-NEXT:  # %bb.394: # %cond.load1525
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
+; CHECK-RV64-NEXT:    vmv.s.x v16, a2
 ; CHECK-RV64-NEXT:    li a2, 383
 ; CHECK-RV64-NEXT:    li a3, 382
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:  .LBB61_395: # %else1526
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a2, v16
+; CHECK-RV64-NEXT:    vmv.x.s a2, v24
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_396
 ; CHECK-RV64-NEXT:    j .LBB61_898
 ; CHECK-RV64-NEXT:  .LBB61_396: # %else1530
@@ -12475,19 +12485,19 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_459: # %else1778
 ; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 7
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 7
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_461
 ; CHECK-RV64-NEXT:  # %bb.460: # %cond.load1781
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
+; CHECK-RV64-NEXT:    vmv.s.x v16, a1
 ; CHECK-RV64-NEXT:    li a1, 447
 ; CHECK-RV64-NEXT:    li a3, 446
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:  .LBB61_461: # %else1782
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a1, v16
+; CHECK-RV64-NEXT:    vmv.x.s a1, v24
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_462
 ; CHECK-RV64-NEXT:    j .LBB61_960
 ; CHECK-RV64-NEXT:  .LBB61_462: # %else1786
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
index 3153b44386d7a..2356237d790b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
@@ -153,10 +153,10 @@ define void @abs_v4i64_of_sext_v4i8(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-NEXT:    vmax.vv v8, v8, v9
+; CHECK-NEXT:    vmax.vv v10, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vse64.v v10, (a0)
+; CHECK-NEXT:    vzext.vf8 v8, v10
+; CHECK-NEXT:    vse64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i8>, ptr %x
   %a.ext = sext <4 x i8> %a to <4 x i64>
@@ -171,10 +171,10 @@ define void @abs_v4i64_of_sext_v4i16(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-NEXT:    vmax.vv v8, v8, v9
+; CHECK-NEXT:    vmax.vv v10, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vse64.v v10, (a0)
+; CHECK-NEXT:    vzext.vf4 v8, v10
+; CHECK-NEXT:    vse64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %a.ext = sext <4 x i16> %a to <4 x i64>
@@ -189,10 +189,10 @@ define void @abs_v4i64_of_sext_v4i32(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-NEXT:    vmax.vv v8, v8, v9
+; CHECK-NEXT:    vmax.vv v10, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vse64.v v10, (a0)
+; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vse64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i32>, ptr %x
   %a.ext = sext <4 x i32> %a to <4 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index 4bd4a9a854f36..829cc9bac7ed3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -40,12 +40,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
 ; RVF-NEXT:    li a1, 134
-; RVF-NEXT:    vzext.vf2 v10, v8
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vzext.vf2 v12, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVF-NEXT:    vnsrl.wi v10, v8, 0
-; RVF-NEXT:    vrsub.vx v8, v10, a1
+; RVF-NEXT:    vnsrl.wi v8, v12, 0
+; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 8
 ; RVF-NEXT:    vminu.vx v8, v8, a1
 ; RVF-NEXT:    vse8.v v8, (a0)
@@ -56,12 +56,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
 ; RVD-NEXT:    li a1, 134
-; RVD-NEXT:    vzext.vf2 v10, v8
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vzext.vf2 v12, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVD-NEXT:    vnsrl.wi v10, v8, 0
-; RVD-NEXT:    vrsub.vx v8, v10, a1
+; RVD-NEXT:    vnsrl.wi v8, v12, 0
+; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 8
 ; RVD-NEXT:    vminu.vx v8, v8, a1
 ; RVD-NEXT:    vse8.v v8, (a0)
@@ -121,11 +121,11 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF-LABEL: ctlz_v8i16:
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RVF-NEXT:    vle16.v v8, (a0)
+; RVF-NEXT:    vle16.v v10, (a0)
 ; RVF-NEXT:    li a1, 142
-; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVF-NEXT:    vnsrl.wi v8, v10, 23
-; RVF-NEXT:    vrsub.vx v8, v8, a1
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVF-NEXT:    vnsrl.wi v10, v8, 23
+; RVF-NEXT:    vrsub.vx v8, v10, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vminu.vx v8, v8, a1
 ; RVF-NEXT:    vse16.v v8, (a0)
@@ -134,11 +134,11 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: ctlz_v8i16:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RVD-NEXT:    vle16.v v8, (a0)
+; RVD-NEXT:    vle16.v v10, (a0)
 ; RVD-NEXT:    li a1, 142
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wi v8, v10, 23
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wi v10, v8, 23
+; RVD-NEXT:    vrsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vminu.vx v8, v8, a1
 ; RVD-NEXT:    vse16.v v8, (a0)
@@ -216,12 +216,12 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: ctlz_v4i32:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    vle32.v v10, (a0)
 ; RVD-NEXT:    li a1, 52
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wx v8, v10, a1
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wx v10, v8, a1
 ; RVD-NEXT:    li a1, 1054
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vrsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vminu.vx v8, v8, a1
 ; RVD-NEXT:    vse32.v v8, (a0)
@@ -429,12 +429,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
 ; RVF-NEXT:    li a1, 134
-; RVF-NEXT:    vzext.vf2 v12, v8
-; RVF-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVF-NEXT:    vnsrl.wi v8, v16, 23
+; RVF-NEXT:    vzext.vf2 v16, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVF-NEXT:    vnsrl.wi v16, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVF-NEXT:    vnsrl.wi v12, v8, 0
-; RVF-NEXT:    vrsub.vx v8, v12, a1
+; RVF-NEXT:    vnsrl.wi v8, v16, 0
+; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 8
 ; RVF-NEXT:    vminu.vx v8, v8, a1
 ; RVF-NEXT:    vse8.v v8, (a0)
@@ -446,12 +446,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
 ; RVD-NEXT:    li a1, 134
-; RVD-NEXT:    vzext.vf2 v12, v8
-; RVD-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVD-NEXT:    vnsrl.wi v8, v16, 23
+; RVD-NEXT:    vzext.vf2 v16, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVD-NEXT:    vnsrl.wi v16, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVD-NEXT:    vnsrl.wi v12, v8, 0
-; RVD-NEXT:    vrsub.vx v8, v12, a1
+; RVD-NEXT:    vnsrl.wi v8, v16, 0
+; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 8
 ; RVD-NEXT:    vminu.vx v8, v8, a1
 ; RVD-NEXT:    vse8.v v8, (a0)
@@ -512,11 +512,11 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF-LABEL: ctlz_v16i16:
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RVF-NEXT:    vle16.v v8, (a0)
+; RVF-NEXT:    vle16.v v12, (a0)
 ; RVF-NEXT:    li a1, 142
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
-; RVF-NEXT:    vrsub.vx v8, v8, a1
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
+; RVF-NEXT:    vrsub.vx v8, v12, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vminu.vx v8, v8, a1
 ; RVF-NEXT:    vse16.v v8, (a0)
@@ -525,11 +525,11 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: ctlz_v16i16:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RVD-NEXT:    vle16.v v8, (a0)
+; RVD-NEXT:    vle16.v v12, (a0)
 ; RVD-NEXT:    li a1, 142
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
+; RVD-NEXT:    vrsub.vx v8, v12, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vminu.vx v8, v8, a1
 ; RVD-NEXT:    vse16.v v8, (a0)
@@ -607,12 +607,12 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: ctlz_v8i32:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    vle32.v v12, (a0)
 ; RVD-NEXT:    li a1, 52
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wx v8, v12, a1
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wx v12, v8, a1
 ; RVD-NEXT:    li a1, 1054
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vrsub.vx v8, v12, a1
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vminu.vx v8, v8, a1
 ; RVD-NEXT:    vse32.v v8, (a0)
@@ -747,11 +747,11 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfncvt.f.xu.w v11, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v11, 23
-; RVF-NEXT:    vwsubu.vv v12, v10, v8
+; RVF-NEXT:    vsrl.vi v11, v11, 23
+; RVF-NEXT:    vwsubu.vv v8, v10, v11
 ; RVF-NEXT:    li a1, 64
 ; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RVF-NEXT:    vminu.vx v8, v12, a1
+; RVF-NEXT:    vminu.vx v8, v8, a1
 ; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -817,13 +817,13 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
-; RVF-NEXT:    vzext.vf2 v10, v8
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vzext.vf2 v12, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVF-NEXT:    vnsrl.wi v10, v8, 0
+; RVF-NEXT:    vnsrl.wi v8, v12, 0
 ; RVF-NEXT:    li a1, 134
-; RVF-NEXT:    vrsub.vx v8, v10, a1
+; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -831,13 +831,13 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
-; RVD-NEXT:    vzext.vf2 v10, v8
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vzext.vf2 v12, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVD-NEXT:    vnsrl.wi v10, v8, 0
+; RVD-NEXT:    vnsrl.wi v8, v12, 0
 ; RVD-NEXT:    li a1, 134
-; RVD-NEXT:    vrsub.vx v8, v10, a1
+; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -894,22 +894,22 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF-LABEL: ctlz_zero_undef_v8i16:
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RVF-NEXT:    vle16.v v8, (a0)
-; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVF-NEXT:    vnsrl.wi v8, v10, 23
+; RVF-NEXT:    vle16.v v10, (a0)
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVF-NEXT:    vnsrl.wi v10, v8, 23
 ; RVF-NEXT:    li a1, 142
-; RVF-NEXT:    vrsub.vx v8, v8, a1
+; RVF-NEXT:    vrsub.vx v8, v10, a1
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
 ; RVD-LABEL: ctlz_zero_undef_v8i16:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RVD-NEXT:    vle16.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wi v8, v10, 23
+; RVD-NEXT:    vle16.v v10, (a0)
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wi v10, v8, 23
 ; RVD-NEXT:    li a1, 142
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vrsub.vx v8, v10, a1
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -982,12 +982,12 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: ctlz_zero_undef_v4i32:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    vle32.v v10, (a0)
 ; RVD-NEXT:    li a1, 52
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wx v8, v10, a1
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wx v10, v8, a1
 ; RVD-NEXT:    li a1, 1054
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vrsub.vx v8, v10, a1
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1185,13 +1185,13 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
-; RVF-NEXT:    vzext.vf2 v12, v8
-; RVF-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVF-NEXT:    vnsrl.wi v8, v16, 23
+; RVF-NEXT:    vzext.vf2 v16, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVF-NEXT:    vnsrl.wi v16, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVF-NEXT:    vnsrl.wi v12, v8, 0
+; RVF-NEXT:    vnsrl.wi v8, v16, 0
 ; RVF-NEXT:    li a1, 134
-; RVF-NEXT:    vrsub.vx v8, v12, a1
+; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -1200,13 +1200,13 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
-; RVD-NEXT:    vzext.vf2 v12, v8
-; RVD-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVD-NEXT:    vnsrl.wi v8, v16, 23
+; RVD-NEXT:    vzext.vf2 v16, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVD-NEXT:    vnsrl.wi v16, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVD-NEXT:    vnsrl.wi v12, v8, 0
+; RVD-NEXT:    vnsrl.wi v8, v16, 0
 ; RVD-NEXT:    li a1, 134
-; RVD-NEXT:    vrsub.vx v8, v12, a1
+; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1264,22 +1264,22 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF-LABEL: ctlz_zero_undef_v16i16:
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RVF-NEXT:    vle16.v v8, (a0)
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vle16.v v12, (a0)
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
 ; RVF-NEXT:    li a1, 142
-; RVF-NEXT:    vrsub.vx v8, v8, a1
+; RVF-NEXT:    vrsub.vx v8, v12, a1
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
 ; RVD-LABEL: ctlz_zero_undef_v16i16:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RVD-NEXT:    vle16.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vle16.v v12, (a0)
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
 ; RVD-NEXT:    li a1, 142
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vrsub.vx v8, v12, a1
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1352,12 +1352,12 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: ctlz_zero_undef_v8i32:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    vle32.v v12, (a0)
 ; RVD-NEXT:    li a1, 52
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wx v8, v12, a1
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wx v12, v8, a1
 ; RVD-NEXT:    li a1, 1054
-; RVD-NEXT:    vrsub.vx v8, v8, a1
+; RVD-NEXT:    vrsub.vx v8, v12, a1
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1489,9 +1489,9 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfncvt.f.xu.w v11, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v11, 23
-; RVF-NEXT:    vwsubu.vv v12, v10, v8
-; RVF-NEXT:    vse64.v v12, (a0)
+; RVF-NEXT:    vsrl.vi v11, v11, 23
+; RVF-NEXT:    vwsubu.vv v8, v10, v11
+; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
 ; RVD-LABEL: ctlz_zero_undef_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 1724b92a9ab48..d884cece89507 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -34,18 +34,18 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-LABEL: cttz_v16i8:
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    vle8.v v14, (a0)
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vrsub.vi v9, v8, 0
-; RVF-NEXT:    vand.vv v9, v8, v9
+; RVF-NEXT:    vrsub.vi v8, v14, 0
+; RVF-NEXT:    vand.vv v8, v14, v8
 ; RVF-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RVF-NEXT:    vzext.vf2 v10, v9
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVF-NEXT:    vnsrl.wi v10, v12, 23
+; RVF-NEXT:    vzext.vf2 v12, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVF-NEXT:    vnsrl.wi v9, v10, 0
-; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vsub.vx v8, v9, a1
+; RVF-NEXT:    vnsrl.wi v8, v12, 0
+; RVF-NEXT:    vmseq.vi v0, v14, 0
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
@@ -53,18 +53,18 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: cttz_v16i8:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    vle8.v v14, (a0)
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vrsub.vi v9, v8, 0
-; RVD-NEXT:    vand.vv v9, v8, v9
+; RVD-NEXT:    vrsub.vi v8, v14, 0
+; RVD-NEXT:    vand.vv v8, v14, v8
 ; RVD-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RVD-NEXT:    vzext.vf2 v10, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wi v10, v12, 23
+; RVD-NEXT:    vzext.vf2 v12, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVD-NEXT:    vnsrl.wi v9, v10, 0
-; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v9, a1
+; RVD-NEXT:    vnsrl.wi v8, v12, 0
+; RVD-NEXT:    vmseq.vi v0, v14, 0
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
@@ -121,10 +121,10 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vand.vv v8, v8, v9
-; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVF-NEXT:    vnsrl.wi v8, v10, 23
-; RVF-NEXT:    vsub.vx v8, v8, a1
+; RVF-NEXT:    vand.vv v10, v8, v9
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVF-NEXT:    vnsrl.wi v10, v8, 23
+; RVF-NEXT:    vsub.vx v8, v10, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse16.v v8, (a0)
@@ -137,10 +137,10 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vand.vv v8, v8, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wi v8, v10, 23
-; RVD-NEXT:    vsub.vx v8, v8, a1
+; RVD-NEXT:    vand.vv v10, v8, v9
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wi v10, v8, 23
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse16.v v8, (a0)
@@ -213,15 +213,15 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: cttz_v4i32:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    vle32.v v10, (a0)
 ; RVD-NEXT:    li a1, 52
-; RVD-NEXT:    vrsub.vi v9, v8, 0
-; RVD-NEXT:    vand.vv v9, v8, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVD-NEXT:    vnsrl.wx v9, v10, a1
+; RVD-NEXT:    vrsub.vi v8, v10, 0
+; RVD-NEXT:    vand.vv v11, v10, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v11
+; RVD-NEXT:    vnsrl.wx v11, v8, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v9, a1
+; RVD-NEXT:    vmseq.vi v0, v10, 0
+; RVD-NEXT:    vsub.vx v8, v11, a1
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse32.v v8, (a0)
@@ -409,18 +409,18 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    vle8.v v20, (a0)
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vrsub.vi v10, v8, 0
-; RVF-NEXT:    vand.vv v10, v8, v10
+; RVF-NEXT:    vrsub.vi v8, v20, 0
+; RVF-NEXT:    vand.vv v8, v20, v8
 ; RVF-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; RVF-NEXT:    vzext.vf2 v12, v10
-; RVF-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVF-NEXT:    vnsrl.wi v12, v16, 23
+; RVF-NEXT:    vzext.vf2 v16, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVF-NEXT:    vnsrl.wi v16, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVF-NEXT:    vnsrl.wi v10, v12, 0
-; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vsub.vx v8, v10, a1
+; RVF-NEXT:    vnsrl.wi v8, v16, 0
+; RVF-NEXT:    vmseq.vi v0, v20, 0
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
@@ -429,18 +429,18 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    vle8.v v20, (a0)
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vrsub.vi v10, v8, 0
-; RVD-NEXT:    vand.vv v10, v8, v10
+; RVD-NEXT:    vrsub.vi v8, v20, 0
+; RVD-NEXT:    vand.vv v8, v20, v8
 ; RVD-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; RVD-NEXT:    vzext.vf2 v12, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVD-NEXT:    vnsrl.wi v12, v16, 23
+; RVD-NEXT:    vzext.vf2 v16, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVD-NEXT:    vnsrl.wi v16, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVD-NEXT:    vnsrl.wi v10, v12, 0
-; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v10, a1
+; RVD-NEXT:    vnsrl.wi v8, v16, 0
+; RVD-NEXT:    vmseq.vi v0, v20, 0
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
@@ -498,10 +498,10 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vand.vv v8, v8, v10
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
-; RVF-NEXT:    vsub.vx v8, v8, a1
+; RVF-NEXT:    vand.vv v12, v8, v10
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
+; RVF-NEXT:    vsub.vx v8, v12, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse16.v v8, (a0)
@@ -514,10 +514,10 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vand.vv v8, v8, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
-; RVD-NEXT:    vsub.vx v8, v8, a1
+; RVD-NEXT:    vand.vv v12, v8, v10
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
+; RVD-NEXT:    vsub.vx v8, v12, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse16.v v8, (a0)
@@ -590,15 +590,15 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD-LABEL: cttz_v8i32:
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    vle32.v v12, (a0)
 ; RVD-NEXT:    li a1, 52
-; RVD-NEXT:    vrsub.vi v10, v8, 0
-; RVD-NEXT:    vand.vv v10, v8, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wx v10, v12, a1
+; RVD-NEXT:    vrsub.vi v8, v12, 0
+; RVD-NEXT:    vand.vv v14, v12, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v14
+; RVD-NEXT:    vnsrl.wx v14, v8, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v10, a1
+; RVD-NEXT:    vmseq.vi v0, v12, 0
+; RVD-NEXT:    vsub.vx v8, v14, a1
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse32.v v8, (a0)
@@ -716,11 +716,11 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
 ; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RVF-NEXT:    vsrl.vi v8, v12, 23
-; RVF-NEXT:    vwsubu.vx v10, v8, a1
+; RVF-NEXT:    vsrl.vi v10, v12, 23
+; RVF-NEXT:    vwsubu.vx v8, v10, a1
 ; RVF-NEXT:    li a1, 64
 ; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -788,13 +788,13 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v9
 ; RVF-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RVF-NEXT:    vzext.vf2 v10, v8
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vzext.vf2 v12, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVF-NEXT:    vnsrl.wi v10, v8, 0
+; RVF-NEXT:    vnsrl.wi v8, v12, 0
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v8, v10, a1
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -805,13 +805,13 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v9
 ; RVD-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RVD-NEXT:    vzext.vf2 v10, v8
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vzext.vf2 v12, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RVD-NEXT:    vnsrl.wi v10, v8, 0
+; RVD-NEXT:    vnsrl.wi v8, v12, 0
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v8, v10, a1
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -864,11 +864,11 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
-; RVF-NEXT:    vand.vv v8, v8, v9
-; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVF-NEXT:    vnsrl.wi v8, v10, 23
+; RVF-NEXT:    vand.vv v10, v8, v9
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVF-NEXT:    vnsrl.wi v10, v8, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v8, v8, a1
+; RVF-NEXT:    vsub.vx v8, v10, a1
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -877,11 +877,11 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
-; RVD-NEXT:    vand.vv v8, v8, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wi v8, v10, 23
+; RVD-NEXT:    vand.vv v10, v8, v9
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wi v10, v8, 23
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v8, v8, a1
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -951,11 +951,11 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vle32.v v8, (a0)
 ; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
-; RVD-NEXT:    vand.vv v8, v8, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    vnsrl.wx v8, v10, a1
+; RVD-NEXT:    vand.vv v10, v8, v9
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v10
+; RVD-NEXT:    vnsrl.wx v10, v8, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v8, v8, a1
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1134,13 +1134,13 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v10
 ; RVF-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; RVF-NEXT:    vzext.vf2 v12, v8
-; RVF-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVF-NEXT:    vnsrl.wi v8, v16, 23
+; RVF-NEXT:    vzext.vf2 v16, v8
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVF-NEXT:    vnsrl.wi v16, v8, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVF-NEXT:    vnsrl.wi v12, v8, 0
+; RVF-NEXT:    vnsrl.wi v8, v16, 0
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v8, v12, a1
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -1152,13 +1152,13 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v10
 ; RVD-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; RVD-NEXT:    vzext.vf2 v12, v8
-; RVD-NEXT:    vfwcvt.f.xu.v v16, v12
-; RVD-NEXT:    vnsrl.wi v8, v16, 23
+; RVD-NEXT:    vzext.vf2 v16, v8
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v16
+; RVD-NEXT:    vnsrl.wi v16, v8, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RVD-NEXT:    vnsrl.wi v12, v8, 0
+; RVD-NEXT:    vnsrl.wi v8, v16, 0
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v8, v12, a1
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1212,11 +1212,11 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
-; RVF-NEXT:    vand.vv v8, v8, v10
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vand.vv v12, v8, v10
+; RVF-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVF-NEXT:    vnsrl.wi v12, v8, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v8, v8, a1
+; RVF-NEXT:    vsub.vx v8, v12, a1
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -1225,11 +1225,11 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
-; RVD-NEXT:    vand.vv v8, v8, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vand.vv v12, v8, v10
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wi v12, v8, 23
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v8, v8, a1
+; RVD-NEXT:    vsub.vx v8, v12, a1
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1299,11 +1299,11 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vle32.v v8, (a0)
 ; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
-; RVD-NEXT:    vand.vv v8, v8, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    vnsrl.wx v8, v12, a1
+; RVD-NEXT:    vand.vv v12, v8, v10
+; RVD-NEXT:    vfwcvt.f.xu.v v8, v12
+; RVD-NEXT:    vnsrl.wx v12, v8, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v8, v8, a1
+; RVD-NEXT:    vsub.vx v8, v12, a1
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1414,10 +1414,10 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v10, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v10, 23
+; RVF-NEXT:    vsrl.vi v10, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vwsubu.vx v10, v8, a1
-; RVF-NEXT:    vse64.v v10, (a0)
+; RVF-NEXT:    vwsubu.vx v8, v10, a1
+; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
 ; RVD-LABEL: cttz_zero_undef_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
index ff52f5d2039e1..78ad10033feaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
@@ -36,9 +36,9 @@ define void @fpext_v8f16_v8f32(ptr %x, ptr %y) {
 ; CHECK-LABEL: fpext_v8f16_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
+; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %d = fpext <8 x half> %a to <8 x float>
@@ -51,10 +51,10 @@ define void @fpext_v8f16_v8f64(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v10
-; CHECK-NEXT:    vse64.v v12, (a1)
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %d = fpext <8 x half> %a to <8 x double>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 7817f010c4deb..9102cd6ed7036 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -90,10 +90,11 @@ define <8 x float> @interleave_v4f32(<4 x float> %x, <4 x float> %y) {
 ; V128-LABEL: interleave_v4f32:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v8, v9
+; V128-NEXT:    vmv1r.v v10, v9
+; V128-NEXT:    vmv1r.v v11, v8
+; V128-NEXT:    vwaddu.vv v8, v11, v10
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v10, a0, v9
-; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    vwmaccu.vx v8, a0, v10
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v4f32:
@@ -113,10 +114,11 @@ define <16 x half> @interleave_v8f16(<8 x half> %x, <8 x half> %y) {
 ; V128-LABEL: interleave_v8f16:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v9, v8
+; V128-NEXT:    vmv1r.v v10, v9
+; V128-NEXT:    vmv1r.v v11, v8
+; V128-NEXT:    vwaddu.vv v8, v10, v11
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v10, a0, v8
-; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    vwmaccu.vx v8, a0, v11
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v8f16:
@@ -135,10 +137,11 @@ define <16 x float> @interleave_v8f32(<8 x float> %x, <8 x float> %y) {
 ; V128-LABEL: interleave_v8f32:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; V128-NEXT:    vwaddu.vv v12, v8, v10
+; V128-NEXT:    vmv2r.v v12, v10
+; V128-NEXT:    vmv2r.v v14, v8
+; V128-NEXT:    vwaddu.vv v8, v14, v12
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v12, a0, v10
-; V128-NEXT:    vmv4r.v v8, v12
+; V128-NEXT:    vwmaccu.vx v8, a0, v12
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v8f32:
@@ -157,10 +160,11 @@ define <32 x half> @interleave_v16f16(<16 x half> %x, <16 x half> %y) {
 ; V128-LABEL: interleave_v16f16:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; V128-NEXT:    vwaddu.vv v12, v8, v10
+; V128-NEXT:    vmv2r.v v12, v10
+; V128-NEXT:    vmv2r.v v14, v8
+; V128-NEXT:    vwaddu.vv v8, v14, v12
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v12, a0, v10
-; V128-NEXT:    vmv4r.v v8, v12
+; V128-NEXT:    vwmaccu.vx v8, a0, v12
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v16f16:
@@ -179,19 +183,21 @@ define <32 x float> @interleave_v16f32(<16 x float> %x, <16 x float> %y) {
 ; V128-LABEL: interleave_v16f32:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v16, v8, v12
+; V128-NEXT:    vmv4r.v v16, v12
+; V128-NEXT:    vmv4r.v v20, v8
+; V128-NEXT:    vwaddu.vv v8, v20, v16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v16, a0, v12
-; V128-NEXT:    vmv8r.v v8, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v16f32:
 ; V512:       # %bb.0:
 ; V512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; V512-NEXT:    vwaddu.vv v10, v8, v9
+; V512-NEXT:    vmv1r.v v10, v9
+; V512-NEXT:    vmv1r.v v11, v8
+; V512-NEXT:    vwaddu.vv v8, v11, v10
 ; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v10, a0, v9
-; V512-NEXT:    vmv2r.v v8, v10
+; V512-NEXT:    vwmaccu.vx v8, a0, v10
 ; V512-NEXT:    ret
   %a = shufflevector <16 x float> %x, <16 x float> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   ret <32 x float> %a
@@ -200,22 +206,26 @@ define <32 x float> @interleave_v16f32(<16 x float> %x, <16 x float> %y) {
 define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
 ; V128-LABEL: interleave_v32f16:
 ; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V128-NEXT:    vmv4r.v v16, v12
+; V128-NEXT:    vmv4r.v v20, v8
 ; V128-NEXT:    li a0, 32
 ; V128-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v16, v8, v12
+; V128-NEXT:    vwaddu.vv v8, v20, v16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v16, a0, v12
-; V128-NEXT:    vmv8r.v v8, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32f16:
 ; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V512-NEXT:    vmv1r.v v10, v9
+; V512-NEXT:    vmv1r.v v11, v8
 ; V512-NEXT:    li a0, 32
 ; V512-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; V512-NEXT:    vwaddu.vv v10, v8, v9
+; V512-NEXT:    vwaddu.vv v8, v11, v10
 ; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v10, a0, v9
-; V512-NEXT:    vmv2r.v v8, v10
+; V512-NEXT:    vwmaccu.vx v8, a0, v10
 ; V512-NEXT:    ret
   %a = shufflevector <32 x half> %x, <32 x half> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   ret <64 x half> %a
@@ -263,12 +273,14 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ;
 ; V512-LABEL: interleave_v32f32:
 ; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V512-NEXT:    vmv2r.v v12, v10
+; V512-NEXT:    vmv2r.v v14, v8
 ; V512-NEXT:    li a0, 32
 ; V512-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; V512-NEXT:    vwaddu.vv v12, v8, v10
+; V512-NEXT:    vwaddu.vv v8, v14, v12
 ; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v12, a0, v10
-; V512-NEXT:    vmv4r.v v8, v12
+; V512-NEXT:    vwmaccu.vx v8, a0, v12
 ; V512-NEXT:    ret
   %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   ret <64 x float> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
index c14eae0b1de61..abb929eaaf6e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
@@ -17,13 +17,13 @@ define void @fcmp_oeq_vv_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10
-; ZVFHMIN-NEXT:    vsm.v v8, (a2)
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v10, v8
+; ZVFHMIN-NEXT:    vsm.v v12, (a2)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -45,13 +45,13 @@ define void @fcmp_oeq_vv_v8f16_nonans(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_v8f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10
-; ZVFHMIN-NEXT:    vsm.v v8, (a2)
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v10, v8
+; ZVFHMIN-NEXT:    vsm.v v12, (a2)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -173,13 +173,13 @@ define void @fcmp_olt_vv_v16f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_vv_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
-; ZVFHMIN-NEXT:    vsm.v v8, (a2)
+; ZVFHMIN-NEXT:    vmflt.vv v16, v12, v8
+; ZVFHMIN-NEXT:    vsm.v v16, (a2)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = load <16 x half>, ptr %y
@@ -201,13 +201,13 @@ define void @fcmp_olt_vv_v16f16_nonans(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_vv_v16f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
-; ZVFHMIN-NEXT:    vsm.v v8, (a2)
+; ZVFHMIN-NEXT:    vmflt.vv v16, v12, v8
+; ZVFHMIN-NEXT:    vsm.v v16, (a2)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = load <16 x half>, ptr %y
@@ -314,13 +314,13 @@ define void @fcmp_ule_vv_v32f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a3, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
-; ZVFHMIN-NEXT:    vle16.v v12, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vle16.v v24, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v24, v16
-; ZVFHMIN-NEXT:    vmnot.m v8, v8
+; ZVFHMIN-NEXT:    vmflt.vv v24, v16, v8
+; ZVFHMIN-NEXT:    vmnot.m v8, v24
 ; ZVFHMIN-NEXT:    vsm.v v8, (a2)
 ; ZVFHMIN-NEXT:    ret
   %a = load <32 x half>, ptr %x
@@ -345,13 +345,13 @@ define void @fcmp_ule_vv_v32f16_nonans(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a3, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v12, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vle16.v v16, (a1)
+; ZVFHMIN-NEXT:    vle16.v v24, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v24, v16
-; ZVFHMIN-NEXT:    vsm.v v8, (a2)
+; ZVFHMIN-NEXT:    vmfle.vv v24, v16, v8
+; ZVFHMIN-NEXT:    vsm.v v24, (a2)
 ; ZVFHMIN-NEXT:    ret
   %a = load <32 x half>, ptr %x
   %b = load <32 x half>, ptr %y
@@ -643,14 +643,14 @@ define void @fcmp_oeq_vf_v8f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v12
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v8, v10
+; ZVFHMIN-NEXT:    vsm.v v12, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = insertelement <8 x half> poison, half %y, i32 0
@@ -672,14 +672,14 @@ define void @fcmp_oeq_vf_v8f16_nonans(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_v8f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v12
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v8, v10
+; ZVFHMIN-NEXT:    vsm.v v12, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = insertelement <8 x half> poison, half %y, i32 0
@@ -801,14 +801,14 @@ define void @fcmp_olt_vf_v16f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_vf_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmflt.vv v16, v8, v12
+; ZVFHMIN-NEXT:    vsm.v v16, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = insertelement <16 x half> poison, half %y, i32 0
@@ -830,14 +830,14 @@ define void @fcmp_olt_vf_v16f16_nonans(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_vf_v16f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmflt.vv v16, v8, v12
+; ZVFHMIN-NEXT:    vsm.v v16, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = insertelement <16 x half> poison, half %y, i32 0
@@ -944,14 +944,14 @@ define void @fcmp_ule_vf_v32f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a2, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v24, v16
-; ZVFHMIN-NEXT:    vmnot.m v8, v8
+; ZVFHMIN-NEXT:    vmflt.vv v24, v16, v8
+; ZVFHMIN-NEXT:    vmnot.m v8, v24
 ; ZVFHMIN-NEXT:    vsm.v v8, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <32 x half>, ptr %x
@@ -976,14 +976,14 @@ define void @fcmp_ule_vf_v32f16_nonans(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a2, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v24
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmfle.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vsm.v v24, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <32 x half>, ptr %x
   %b = insertelement <32 x half> poison, half %y, i32 0
@@ -1284,14 +1284,14 @@ define void @fcmp_oeq_fv_v8f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v10, v8
+; ZVFHMIN-NEXT:    vsm.v v12, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = insertelement <8 x half> poison, half %y, i32 0
@@ -1313,14 +1313,14 @@ define void @fcmp_oeq_fv_v8f16_nonans(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_fv_v8f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v10, v8
+; ZVFHMIN-NEXT:    vsm.v v12, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = insertelement <8 x half> poison, half %y, i32 0
@@ -1442,14 +1442,14 @@ define void @fcmp_olt_fv_v16f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_fv_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmflt.vv v16, v12, v8
+; ZVFHMIN-NEXT:    vsm.v v16, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = insertelement <16 x half> poison, half %y, i32 0
@@ -1471,14 +1471,14 @@ define void @fcmp_olt_fv_v16f16_nonans(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_fv_v16f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmflt.vv v16, v12, v8
+; ZVFHMIN-NEXT:    vsm.v v16, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = insertelement <16 x half> poison, half %y, i32 0
@@ -1585,14 +1585,14 @@ define void @fcmp_ule_fv_v32f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a2, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v24
-; ZVFHMIN-NEXT:    vmnot.m v8, v8
+; ZVFHMIN-NEXT:    vmflt.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vmnot.m v8, v24
 ; ZVFHMIN-NEXT:    vsm.v v8, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <32 x half>, ptr %x
@@ -1617,14 +1617,14 @@ define void @fcmp_ule_fv_v32f16_nonans(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a2, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v24, v16
-; ZVFHMIN-NEXT:    vsm.v v8, (a1)
+; ZVFHMIN-NEXT:    vmfle.vv v24, v16, v8
+; ZVFHMIN-NEXT:    vsm.v v24, (a1)
 ; ZVFHMIN-NEXT:    ret
   %a = load <32 x half>, ptr %x
   %b = insertelement <32 x half> poison, half %y, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index a749736097331..4812b27f442c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -381,15 +381,15 @@ define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI30_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI30_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vle8.v v18, (a0)
 ; CHECK-NEXT:    lui a0, 11
 ; CHECK-NEXT:    addi a0, a0, -1366
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vvm v12, v12, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v18, v16
+; CHECK-NEXT:    vsext.vf2 v16, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
   ret <16 x float> %out
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index c885b3c03270c..38df622998bf9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -9,12 +9,12 @@ define void @fadd_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v10
+; CHECK-NEXT:    vfadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -30,12 +30,12 @@ define void @fadd_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v10
+; CHECK-NEXT:    vfadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -60,12 +60,12 @@ define void @fadd_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fadd_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -90,12 +90,12 @@ define void @fadd_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fadd_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -143,12 +143,12 @@ define void @fsub_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fsub_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v12, v10
+; CHECK-NEXT:    vfsub.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -164,12 +164,12 @@ define void @fsub_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fsub_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v12, v10
+; CHECK-NEXT:    vfsub.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -194,12 +194,12 @@ define void @fsub_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fsub_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfsub.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -224,12 +224,12 @@ define void @fsub_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fsub_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfsub.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -277,12 +277,12 @@ define void @fmul_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v12, v10
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -298,12 +298,12 @@ define void @fmul_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v12, v10
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -328,12 +328,12 @@ define void @fmul_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fmul_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -358,12 +358,12 @@ define void @fmul_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fmul_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -411,12 +411,12 @@ define void @fdiv_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fdiv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v12, v10
+; CHECK-NEXT:    vfdiv.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -432,12 +432,12 @@ define void @fdiv_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fdiv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v12, v10
+; CHECK-NEXT:    vfdiv.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -462,12 +462,12 @@ define void @fdiv_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fdiv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -492,12 +492,12 @@ define void @fdiv_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fdiv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -1299,10 +1299,10 @@ define void @sqrt_v8bf16(ptr %x) {
 ; CHECK-LABEL: sqrt_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsqrt.v v8, v10
+; CHECK-NEXT:    vfsqrt.v v8, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -1317,10 +1317,10 @@ define void @sqrt_v6bf16(ptr %x) {
 ; CHECK-LABEL: sqrt_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsqrt.v v8, v10
+; CHECK-NEXT:    vfsqrt.v v8, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -1343,10 +1343,10 @@ define void @sqrt_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: sqrt_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsqrt.v v8, v10
+; ZVFHMIN-NEXT:    vfsqrt.v v8, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -1369,10 +1369,10 @@ define void @sqrt_v6f16(ptr %x) {
 ; ZVFHMIN-LABEL: sqrt_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsqrt.v v8, v10
+; ZVFHMIN-NEXT:    vfsqrt.v v8, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -1415,17 +1415,17 @@ define void @fma_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fma_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v14, (a1)
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v12, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -1439,17 +1439,17 @@ define void @fma_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fma_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v14, (a1)
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v12, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -1473,17 +1473,17 @@ define void @fma_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fma_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a2)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vle16.v v14, (a1)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -1507,17 +1507,17 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fma_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a2)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vle16.v v14, (a1)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -1568,18 +1568,18 @@ define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vle16.v v14, (a1)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT:    vxor.vx v12, v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v12, v14
+; CHECK-NEXT:    vfmadd.vv v12, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -1595,18 +1595,18 @@ define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vle16.v v14, (a1)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT:    vxor.vx v12, v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v12, v14
+; CHECK-NEXT:    vfmadd.vv v12, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -1632,18 +1632,18 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
+; ZVFHMIN-NEXT:    vle16.v v14, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -1669,18 +1669,18 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
+; ZVFHMIN-NEXT:    vle16.v v14, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -1734,12 +1734,12 @@ define void @fadd_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v16, v12
+; CHECK-NEXT:    vfadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
 ; CHECK-NEXT:    vse16.v v12, (a0)
@@ -1764,12 +1764,12 @@ define void @fadd_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fadd_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v16, v12
+; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v8
 ; ZVFHMIN-NEXT:    vse16.v v12, (a0)
@@ -1817,12 +1817,12 @@ define void @fsub_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fsub_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v16, v12
+; CHECK-NEXT:    vfsub.vv v8, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
 ; CHECK-NEXT:    vse16.v v12, (a0)
@@ -1847,12 +1847,12 @@ define void @fsub_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fsub_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v16, v12
+; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v8
 ; ZVFHMIN-NEXT:    vse16.v v12, (a0)
@@ -1900,12 +1900,12 @@ define void @fmul_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v16, v12
+; CHECK-NEXT:    vfmul.vv v8, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
 ; CHECK-NEXT:    vse16.v v12, (a0)
@@ -1930,12 +1930,12 @@ define void @fmul_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fmul_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v16, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v8
 ; ZVFHMIN-NEXT:    vse16.v v12, (a0)
@@ -1983,12 +1983,12 @@ define void @fdiv_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fdiv_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v16, v12
+; CHECK-NEXT:    vfdiv.vv v8, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
 ; CHECK-NEXT:    vse16.v v12, (a0)
@@ -2013,12 +2013,12 @@ define void @fdiv_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fdiv_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v16, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v8
 ; ZVFHMIN-NEXT:    vse16.v v12, (a0)
@@ -2132,17 +2132,17 @@ define void @fma_v16bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fma_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vle16.v v20, (a1)
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v20
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v20, v16
+; CHECK-NEXT:    vfmadd.vv v16, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
-; CHECK-NEXT:    vse16.v v12, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <16 x bfloat>, ptr %x
   %b = load <16 x bfloat>, ptr %y
@@ -2166,17 +2166,17 @@ define void @fma_v16f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fma_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vle16.v v12, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vle16.v v12, (a2)
+; ZVFHMIN-NEXT:    vle16.v v16, (a0)
+; ZVFHMIN-NEXT:    vle16.v v20, (a1)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v8
-; ZVFHMIN-NEXT:    vse16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = load <16 x half>, ptr %y
@@ -2226,13 +2226,13 @@ define void @fadd_vf_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fadd_vf_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v10, v12
+; CHECK-NEXT:    vfadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2249,13 +2249,13 @@ define void @fadd_vf_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fadd_vf_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v10, v12
+; CHECK-NEXT:    vfadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2280,13 +2280,13 @@ define void @fadd_vf_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fadd_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfadd.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2311,13 +2311,13 @@ define void @fadd_vf_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fadd_vf_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfadd.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2366,13 +2366,13 @@ define void @fadd_fv_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fadd_fv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v10
+; CHECK-NEXT:    vfadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2389,13 +2389,13 @@ define void @fadd_fv_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fadd_fv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v10
+; CHECK-NEXT:    vfadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2420,13 +2420,13 @@ define void @fadd_fv_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fadd_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2451,13 +2451,13 @@ define void @fadd_fv_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fadd_fv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2506,13 +2506,13 @@ define void @fsub_vf_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fsub_vf_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v10, v12
+; CHECK-NEXT:    vfsub.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2529,13 +2529,13 @@ define void @fsub_vf_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fsub_vf_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v10, v12
+; CHECK-NEXT:    vfsub.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2560,13 +2560,13 @@ define void @fsub_vf_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fsub_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfsub.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2591,13 +2591,13 @@ define void @fsub_vf_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fsub_vf_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfsub.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2646,13 +2646,13 @@ define void @fsub_fv_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fsub_fv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v12, v10
+; CHECK-NEXT:    vfsub.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2669,13 +2669,13 @@ define void @fsub_fv_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fsub_fv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v12, v10
+; CHECK-NEXT:    vfsub.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2700,13 +2700,13 @@ define void @fsub_fv_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fsub_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfsub.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2731,13 +2731,13 @@ define void @fsub_fv_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fsub_fv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfsub.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2786,13 +2786,13 @@ define void @fmul_vf_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fmul_vf_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v10, v12
+; CHECK-NEXT:    vfmul.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2809,13 +2809,13 @@ define void @fmul_vf_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fmul_vf_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v10, v12
+; CHECK-NEXT:    vfmul.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2840,13 +2840,13 @@ define void @fmul_vf_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fmul_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2871,13 +2871,13 @@ define void @fmul_vf_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fmul_vf_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -2926,13 +2926,13 @@ define void @fmul_fv_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fmul_fv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v12, v10
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2949,13 +2949,13 @@ define void @fmul_fv_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fmul_fv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v12, v10
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -2980,13 +2980,13 @@ define void @fmul_fv_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fmul_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3011,13 +3011,13 @@ define void @fmul_fv_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fmul_fv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3066,13 +3066,13 @@ define void @fdiv_vf_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fdiv_vf_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v10, v12
+; CHECK-NEXT:    vfdiv.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3089,13 +3089,13 @@ define void @fdiv_vf_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fdiv_vf_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v10, v12
+; CHECK-NEXT:    vfdiv.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3120,13 +3120,13 @@ define void @fdiv_vf_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fdiv_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3151,13 +3151,13 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fdiv_vf_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v10, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3206,13 +3206,13 @@ define void @fdiv_fv_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fdiv_fv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v12, v10
+; CHECK-NEXT:    vfdiv.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3229,13 +3229,13 @@ define void @fdiv_fv_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-LABEL: fdiv_fv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v8, v12, v10
+; CHECK-NEXT:    vfdiv.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3260,13 +3260,13 @@ define void @fdiv_fv_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fdiv_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3291,13 +3291,13 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-LABEL: fdiv_fv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v10
+; ZVFHMIN-NEXT:    vfdiv.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3346,18 +3346,18 @@ define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_vf_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vmv.v.x v14, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v12, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -3372,18 +3372,18 @@ define void @fma_vf_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_vf_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vmv.v.x v14, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v12, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -3407,18 +3407,18 @@ define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-LABEL: fma_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -3442,18 +3442,18 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-LABEL: fma_vf_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -3504,18 +3504,18 @@ define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_fv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vmv.v.x v14, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v12, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -3530,18 +3530,18 @@ define void @fma_fv_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_fv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vmv.v.x v14, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v12, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -3565,18 +3565,18 @@ define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-LABEL: fma_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -3600,18 +3600,18 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-LABEL: fma_fv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -3664,18 +3664,18 @@ define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vmv.v.x v10, a2
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT:    vmv.v.x v14, a2
+; CHECK-NEXT:    vxor.vx v12, v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v12, v14
+; CHECK-NEXT:    vfmadd.vv v12, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -3693,18 +3693,18 @@ define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vmv.v.x v10, a2
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT:    vmv.v.x v14, a2
+; CHECK-NEXT:    vxor.vx v12, v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v12, v14
+; CHECK-NEXT:    vfmadd.vv v12, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -3731,18 +3731,18 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vmv.v.x v10, a2
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
+; ZVFHMIN-NEXT:    vmv.v.x v14, a2
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -3769,18 +3769,18 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vmv.v.x v10, a2
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
+; ZVFHMIN-NEXT:    vmv.v.x v14, a2
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -3874,20 +3874,20 @@ define void @trunc_v8bf16(ptr %x) {
 ; CHECK-LABEL: trunc_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = call <8 x bfloat> @llvm.trunc.v8bf16(<8 x bfloat> %a)
@@ -3899,20 +3899,20 @@ define void @trunc_v6bf16(ptr %x) {
 ; CHECK-LABEL: trunc_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = call <6 x bfloat> @llvm.trunc.v6bf16(<6 x bfloat> %a)
@@ -3939,20 +3939,20 @@ define void @trunc_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: trunc_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = call <8 x half> @llvm.trunc.v8f16(<8 x half> %a)
@@ -3979,20 +3979,20 @@ define void @trunc_v6f16(ptr %x) {
 ; ZVFHMIN-LABEL: trunc_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = call <6 x half> @llvm.trunc.v6f16(<6 x half> %a)
@@ -4046,22 +4046,22 @@ define void @ceil_v8bf16(ptr %x) {
 ; CHECK-LABEL: ceil_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = call <8 x bfloat> @llvm.ceil.v8bf16(<8 x bfloat> %a)
@@ -4073,22 +4073,22 @@ define void @ceil_v6bf16(ptr %x) {
 ; CHECK-LABEL: ceil_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = call <6 x bfloat> @llvm.ceil.v6bf16(<6 x bfloat> %a)
@@ -4117,22 +4117,22 @@ define void @ceil_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: ceil_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 3
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = call <8 x half> @llvm.ceil.v8f16(<8 x half> %a)
@@ -4161,22 +4161,22 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFHMIN-LABEL: ceil_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 3
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = call <6 x half> @llvm.ceil.v6f16(<6 x half> %a)
@@ -4234,22 +4234,22 @@ define void @floor_v8bf16(ptr %x) {
 ; CHECK-LABEL: floor_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = call <8 x bfloat> @llvm.floor.v8bf16(<8 x bfloat> %a)
@@ -4261,22 +4261,22 @@ define void @floor_v6bf16(ptr %x) {
 ; CHECK-LABEL: floor_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = call <6 x bfloat> @llvm.floor.v6bf16(<6 x bfloat> %a)
@@ -4305,22 +4305,22 @@ define void @floor_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: floor_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 2
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = call <8 x half> @llvm.floor.v8f16(<8 x half> %a)
@@ -4349,22 +4349,22 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFHMIN-LABEL: floor_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 2
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = call <6 x half> @llvm.floor.v6f16(<6 x half> %a)
@@ -4422,22 +4422,22 @@ define void @round_v8bf16(ptr %x) {
 ; CHECK-LABEL: round_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = call <8 x bfloat> @llvm.round.v8bf16(<8 x bfloat> %a)
@@ -4449,22 +4449,22 @@ define void @round_v6bf16(ptr %x) {
 ; CHECK-LABEL: round_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = call <6 x bfloat> @llvm.round.v6bf16(<6 x bfloat> %a)
@@ -4493,22 +4493,22 @@ define void @round_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: round_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 4
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = call <8 x half> @llvm.round.v8f16(<8 x half> %a)
@@ -4537,22 +4537,22 @@ define void @round_v6f16(ptr %x) {
 ; ZVFHMIN-LABEL: round_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 4
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = call <6 x half> @llvm.round.v6f16(<6 x half> %a)
@@ -4610,20 +4610,20 @@ define void @rint_v8bf16(ptr %x) {
 ; CHECK-LABEL: rint_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = call <8 x bfloat> @llvm.rint.v8bf16(<8 x bfloat> %a)
@@ -4650,20 +4650,20 @@ define void @rint_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: rint_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = call <8 x half> @llvm.rint.v8f16(<8 x half> %a)
@@ -4717,22 +4717,22 @@ define void @nearbyint_v8bf16(ptr %x) {
 ; CHECK-LABEL: nearbyint_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v10, v8
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a1
-; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vse16.v v10, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = call <8 x bfloat> @llvm.nearbyint.v8bf16(<8 x bfloat> %a)
@@ -4761,22 +4761,22 @@ define void @nearbyint_v8f16(ptr %x) {
 ; ZVFHMIN-LABEL: nearbyint_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v10, v8
+; ZVFHMIN-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFHMIN-NEXT:    frflags a1
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    fsflags a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    vse16.v v8, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vse16.v v10, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a)
@@ -4834,19 +4834,19 @@ define void @fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmuladd_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v13, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v14, v12
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v11, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v11
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v13
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v8, v12
+; CHECK-NEXT:    vfadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -4863,19 +4863,19 @@ define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmuladd_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v13, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v14, v12
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v11, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v11
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v13
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v8, v12
+; CHECK-NEXT:    vfadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -4902,19 +4902,19 @@ define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmuladd_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vle16.v v13, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v13
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v8, v12
+; ZVFHMIN-NEXT:    vfadd.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -4941,19 +4941,19 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmuladd_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vle16.v v13, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v13
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v8, v12
+; ZVFHMIN-NEXT:    vfadd.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -5006,19 +5006,19 @@ define void @fmsub_fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmsub_fmuladd_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v13, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v14, v12
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v11, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v11
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v13
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v8, v12
+; CHECK-NEXT:    vfsub.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -5036,19 +5036,19 @@ define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmsub_fmuladd_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v13, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v8, v14, v12
+; CHECK-NEXT:    vfmul.vv v8, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v11, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v11
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v13
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v8, v8, v12
+; CHECK-NEXT:    vfsub.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -5076,19 +5076,19 @@ define void @fmsub_fmuladd_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmsub_fmuladd_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vle16.v v13, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v13
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v8, v12
+; ZVFHMIN-NEXT:    vfsub.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -5116,19 +5116,19 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmsub_fmuladd_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a0)
+; ZVFHMIN-NEXT:    vle16.v v13, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmul.vv v8, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v13
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v8, v8, v12
+; ZVFHMIN-NEXT:    vfsub.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index f0e6df6298471..1f9a8bf8133c8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -113,11 +113,11 @@ define void @fp2si_v8f32_v8i64(ptr %x, ptr %y) {
 ; CHECK-LABEL: fp2si_v8f32_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vmfne.vv v0, v12, v12
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x float>, ptr %x
@@ -132,11 +132,11 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) {
 ; CHECK-LABEL: fp2ui_v8f32_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vmfne.vv v0, v12, v12
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x float>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
index ddcb3c3121bc3..9be93d5209121 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
@@ -400,9 +400,9 @@ define void @fp2si_v8f32_v8i64(ptr %x, ptr %y) {
 ; CHECK-LABEL: fp2si_v8f32_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vse64.v v12, (a1)
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x float>, ptr %x
   %d = fptosi <8 x float> %a to <8 x i64>
@@ -414,9 +414,9 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) {
 ; CHECK-LABEL: fp2ui_v8f32_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vse64.v v12, (a1)
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x float>, ptr %x
   %d = fptoui <8 x float> %a to <8 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index a1466d46f1ba7..c29f4edb8c6a3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -84,8 +84,8 @@ define <15 x double> @vfpext_v15f32_v15f64(<15 x float> %a, <15 x i1> %m, i32 ze
 ; CHECK-LABEL: vfpext_v15f32_v15f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.fpext.v15f64.v15f32(<15 x float> %a, <15 x i1> %m, i32 %vl)
   ret <15 x double> %v
@@ -98,24 +98,25 @@ define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 ze
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> %a, <32 x i1> %m, i32 %vl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
index f6c992280c6e3..cbc4c69669b51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
@@ -269,8 +269,8 @@ define <4 x i64> @vfptosi_v4i64_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-LABEL: vfptosi_v4i64_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -280,8 +280,8 @@ define <4 x i64> @vfptosi_v4i64_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl
 ; CHECK-LABEL: vfptosi_v4i64_v4f32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
index af225f4d95aa2..5dd3e0372f401 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
@@ -269,8 +269,8 @@ define <4 x i64> @vfptoui_v4i64_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-LABEL: vfptoui_v4i64_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -280,8 +280,8 @@ define <4 x i64> @vfptoui_v4i64_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl
 ; CHECK-LABEL: vfptoui_v4i64_v4f32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
index 29f8730021ce4..b52b082282ec4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
@@ -369,9 +369,9 @@ define void @si2fp_v8i16_v8f64(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v10
-; CHECK-NEXT:    vse64.v v12, (a1)
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, ptr %x
   %d = sitofp <8 x i16> %a to <8 x double>
@@ -384,9 +384,9 @@ define void @ui2fp_v8i16_v8f64(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-NEXT:    vse64.v v12, (a1)
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, ptr %x
   %d = uitofp <8 x i16> %a to <8 x double>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 62e7e3b109902..5747bbb3f100c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -269,11 +269,11 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_0:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vle32.v v10, (a1)
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; VLS-NEXT:    vmv.v.v v10, v8
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vmv.v.v v8, v10
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -298,11 +298,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_2:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vle32.v v10, (a1)
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; VLS-NEXT:    vslideup.vi v10, v8, 2
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vslideup.vi v8, v10, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -326,11 +326,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_6:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vle32.v v10, (a1)
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; VLS-NEXT:    vslideup.vi v11, v8, 2
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vslideup.vi v9, v10, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index 16bb2105f8680..a91263e85e9e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -974,86 +974,86 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    li a0, 512
 ; RV32-NEXT:    li a1, 240
-; RV32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV32-NEXT:    vmerge.vim v13, v12, -1, v0
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 3
-; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
+; RV32-NEXT:    vmv.v.i v8, 3
+; RV32-NEXT:    vmv1r.v v0, v13
+; RV32-NEXT:    vmerge.vim v8, v8, 0, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    li a1, 15
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
-; RV32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV32-NEXT:    vmerge.vim v13, v12, -1, v0
 ; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    vmv1r.v v0, v9
+; RV32-NEXT:    vmv1r.v v0, v13
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vmerge.vim v12, v12, -1, v0
+; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vmerge.vim v8, v12, 2, v0
+; RV32-NEXT:    vmerge.vim v8, v8, 2, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 3
-; RV64V-NEXT:    vmv.v.i v8, 0
+; RV64V-NEXT:    vmv.v.i v12, 0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64V-NEXT:    vmerge.vim v13, v12, -1, v0
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64V-NEXT:    vmv.v.i v12, 3
-; RV64V-NEXT:    vmv1r.v v0, v9
-; RV64V-NEXT:    vmerge.vim v12, v12, 0, v0
+; RV64V-NEXT:    vmv.v.i v8, 3
+; RV64V-NEXT:    vmv1r.v v0, v13
+; RV64V-NEXT:    vmerge.vim v8, v8, 0, v0
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 12
-; RV64V-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64V-NEXT:    vmerge.vim v13, v12, -1, v0
 ; RV64V-NEXT:    li a1, 48
-; RV64V-NEXT:    vmv.v.v v0, v9
+; RV64V-NEXT:    vmv.v.v v0, v13
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64V-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64V-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64V-NEXT:    vmv.s.x v0, a1
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
-; RV64V-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV64V-NEXT:    vmv.v.v v0, v8
+; RV64V-NEXT:    vmerge.vim v12, v12, -1, v0
+; RV64V-NEXT:    vmv.v.v v0, v12
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64V-NEXT:    vmerge.vim v8, v12, 2, v0
+; RV64V-NEXT:    vmerge.vim v8, v8, 2, v0
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
 ; RV64ZVE32:       # %bb.0:
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32-NEXT:    vmv.v.i v8, 0
+; RV64ZVE32-NEXT:    vmv.v.i v12, 0
 ; RV64ZVE32-NEXT:    li a0, 512
 ; RV64ZVE32-NEXT:    li a1, 240
-; RV64ZVE32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v13, v12, -1, v0
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64ZVE32-NEXT:    vmv.v.i v12, 3
-; RV64ZVE32-NEXT:    vmv1r.v v0, v9
-; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 0, v0
+; RV64ZVE32-NEXT:    vmv.v.i v8, 3
+; RV64ZVE32-NEXT:    vmv1r.v v0, v13
+; RV64ZVE32-NEXT:    vmerge.vim v8, v8, 0, v0
 ; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV64ZVE32-NEXT:    vmv.s.x v0, a1
 ; RV64ZVE32-NEXT:    li a1, 15
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v13, v12, -1, v0
 ; RV64ZVE32-NEXT:    slli a1, a1, 8
-; RV64ZVE32-NEXT:    vmv.v.v v0, v9
+; RV64ZVE32-NEXT:    vmv.v.v v0, v13
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV64ZVE32-NEXT:    vmv.s.x v0, a1
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV64ZVE32-NEXT:    vmv.v.v v0, v8
+; RV64ZVE32-NEXT:    vmerge.vim v12, v12, -1, v0
+; RV64ZVE32-NEXT:    vmv.v.v v0, v12
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v8, v12, 2, v0
+; RV64ZVE32-NEXT:    vmerge.vim v8, v8, 2, v0
 ; RV64ZVE32-NEXT:    ret
   ret <512 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
index c65f6e5fa7866..970f4b04e919a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
@@ -34,9 +34,9 @@ define void @sext_v8i8_v8i32(ptr %x, ptr %z) {
 ; CHECK-LABEL: sext_v8i8_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vsext.vf4 v8, v10
+; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x i8>, ptr %x
   %b = sext <8 x i8> %a to <8 x i32>
@@ -49,9 +49,9 @@ define void @sext_v32i8_v32i32(ptr %x, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf4 v16, v8
-; CHECK-NEXT:    vse32.v v16, (a1)
+; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vsext.vf4 v8, v16
+; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %x
   %b = sext <32 x i8> %a to <32 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 4911c340c9154..fc59bb0589a09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -126,10 +126,11 @@ define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; V128-LABEL: interleave_v4i32:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v8, v9
+; V128-NEXT:    vmv1r.v v10, v9
+; V128-NEXT:    vmv1r.v v11, v8
+; V128-NEXT:    vwaddu.vv v8, v11, v10
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v10, a0, v9
-; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    vwmaccu.vx v8, a0, v10
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v4i32:
@@ -228,10 +229,11 @@ define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; V128-LABEL: interleave_v8i16:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v9, v8
+; V128-NEXT:    vmv1r.v v10, v9
+; V128-NEXT:    vmv1r.v v11, v8
+; V128-NEXT:    vwaddu.vv v8, v10, v11
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v10, a0, v8
-; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    vwmaccu.vx v8, a0, v11
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v8i16:
@@ -250,10 +252,11 @@ define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; V128-LABEL: interleave_v8i32:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; V128-NEXT:    vwaddu.vv v12, v8, v10
+; V128-NEXT:    vmv2r.v v12, v10
+; V128-NEXT:    vmv2r.v v14, v8
+; V128-NEXT:    vwaddu.vv v8, v14, v12
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v12, a0, v10
-; V128-NEXT:    vmv4r.v v8, v12
+; V128-NEXT:    vwmaccu.vx v8, a0, v12
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v8i32:
@@ -272,10 +275,11 @@ define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; V128-LABEL: interleave_v16i8:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v8, v9
+; V128-NEXT:    vmv1r.v v10, v9
+; V128-NEXT:    vmv1r.v v11, v8
+; V128-NEXT:    vwaddu.vv v8, v11, v10
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v10, a0, v9
-; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    vwmaccu.vx v8, a0, v10
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v16i8:
@@ -294,10 +298,11 @@ define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; V128-LABEL: interleave_v16i16:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; V128-NEXT:    vwaddu.vv v12, v8, v10
+; V128-NEXT:    vmv2r.v v12, v10
+; V128-NEXT:    vmv2r.v v14, v8
+; V128-NEXT:    vwaddu.vv v8, v14, v12
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v12, a0, v10
-; V128-NEXT:    vmv4r.v v8, v12
+; V128-NEXT:    vwmaccu.vx v8, a0, v12
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v16i16:
@@ -316,19 +321,21 @@ define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
 ; V128-LABEL: interleave_v16i32:
 ; V128:       # %bb.0:
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v16, v8, v12
+; V128-NEXT:    vmv4r.v v16, v12
+; V128-NEXT:    vmv4r.v v20, v8
+; V128-NEXT:    vwaddu.vv v8, v20, v16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v16, a0, v12
-; V128-NEXT:    vmv8r.v v8, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v16i32:
 ; V512:       # %bb.0:
 ; V512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; V512-NEXT:    vwaddu.vv v10, v8, v9
+; V512-NEXT:    vmv1r.v v10, v9
+; V512-NEXT:    vmv1r.v v11, v8
+; V512-NEXT:    vwaddu.vv v8, v11, v10
 ; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v10, a0, v9
-; V512-NEXT:    vmv2r.v v8, v10
+; V512-NEXT:    vwmaccu.vx v8, a0, v10
 ; V512-NEXT:    ret
   %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   ret <32 x i32> %a
@@ -337,12 +344,14 @@ define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
 define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
 ; V128-LABEL: interleave_v32i8:
 ; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V128-NEXT:    vmv2r.v v12, v10
+; V128-NEXT:    vmv2r.v v14, v8
 ; V128-NEXT:    li a0, 32
 ; V128-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; V128-NEXT:    vwaddu.vv v12, v8, v10
+; V128-NEXT:    vwaddu.vv v8, v14, v12
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v12, a0, v10
-; V128-NEXT:    vmv4r.v v8, v12
+; V128-NEXT:    vwmaccu.vx v8, a0, v12
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32i8:
@@ -361,22 +370,26 @@ define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
 define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
 ; V128-LABEL: interleave_v32i16:
 ; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V128-NEXT:    vmv4r.v v16, v12
+; V128-NEXT:    vmv4r.v v20, v8
 ; V128-NEXT:    li a0, 32
 ; V128-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v16, v8, v12
+; V128-NEXT:    vwaddu.vv v8, v20, v16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v16, a0, v12
-; V128-NEXT:    vmv8r.v v8, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32i16:
 ; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V512-NEXT:    vmv1r.v v10, v9
+; V512-NEXT:    vmv1r.v v11, v8
 ; V512-NEXT:    li a0, 32
 ; V512-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; V512-NEXT:    vwaddu.vv v10, v8, v9
+; V512-NEXT:    vwaddu.vv v8, v11, v10
 ; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v10, a0, v9
-; V512-NEXT:    vmv2r.v v8, v10
+; V512-NEXT:    vwmaccu.vx v8, a0, v10
 ; V512-NEXT:    ret
   %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   ret <64 x i16> %a
@@ -424,12 +437,14 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ;
 ; V512-LABEL: interleave_v32i32:
 ; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; V512-NEXT:    vmv2r.v v12, v10
+; V512-NEXT:    vmv2r.v v14, v8
 ; V512-NEXT:    li a0, 32
 ; V512-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; V512-NEXT:    vwaddu.vv v12, v8, v10
+; V512-NEXT:    vwaddu.vv v8, v14, v12
 ; V512-NEXT:    li a0, -1
-; V512-NEXT:    vwmaccu.vx v12, a0, v10
-; V512-NEXT:    vmv4r.v v8, v12
+; V512-NEXT:    vwmaccu.vx v8, a0, v12
 ; V512-NEXT:    ret
   %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   ret <64 x i32> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index cd73dbadb2d03..caf9c976b8c76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1063,15 +1063,15 @@ define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI74_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI74_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vle8.v v18, (a0)
 ; CHECK-NEXT:    lui a0, 11
 ; CHECK-NEXT:    addi a0, a0, -1366
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vvm v12, v12, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v18, v16
+; CHECK-NEXT:    vsext.vf2 v16, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
   ret <16 x i32> %out
@@ -1172,9 +1172,9 @@ define <4 x i128> @shuffle_i128(<4 x i128> %a) {
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vle16.v v12, (a1)
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v12
-; RV32-NEXT:    vse64.v v16, (a0)
+; RV32-NEXT:    vle16.v v16, (a1)
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV32-NEXT:    vse64.v v12, (a0)
 ; RV32-NEXT:    addi sp, s0, -128
 ; RV32-NEXT:    .cfi_def_cfa sp, 128
 ; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
@@ -1213,9 +1213,9 @@ define <4 x i128> @shuffle_i128(<4 x i128> %a) {
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI78_0)
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a2)
-; RV64-NEXT:    vle16.v v12, (a1)
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v12
-; RV64-NEXT:    vse64.v v16, (a0)
+; RV64-NEXT:    vle16.v v16, (a1)
+; RV64-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV64-NEXT:    vse64.v v12, (a0)
 ; RV64-NEXT:    addi sp, s0, -128
 ; RV64-NEXT:    .cfi_def_cfa sp, 128
 ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
@@ -1236,9 +1236,9 @@ define void @shuffle_i128_ldst(ptr %p) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI79_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI79_0)
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT:    vse64.v v16, (a0)
+; CHECK-NEXT:    vle16.v v16, (a1)
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT:    vse64.v v12, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i128>, ptr %p
   %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
@@ -1252,12 +1252,12 @@ define void @shuffle_i256_ldst(ptr %p) {
 ; CHECK-NEXT:    lui a1, %hi(.LCPI80_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI80_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vle8.v v16, (a1)
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vsext.vf2 v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v10
-; CHECK-NEXT:    vse64.v v24, (a0)
+; CHECK-NEXT:    vrgatherei16.vv v16, v8, v24
+; CHECK-NEXT:    vse64.v v16, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i256>, ptr %p
   %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
@@ -1318,10 +1318,10 @@ define void @shuffle_i256_splat(ptr %p) nounwind {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vsext.vf2 v18, v16
+; RV32-NEXT:    vsext.vf2 v24, v16
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v24, v8, v18
-; RV32-NEXT:    vse64.v v24, (a0)
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v24
+; RV32-NEXT:    vse64.v v16, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_i256_splat:
@@ -1333,10 +1333,10 @@ define void @shuffle_i256_splat(ptr %p) nounwind {
 ; RV64-NEXT:    addi a1, a1, 1
 ; RV64-NEXT:    slli a1, a1, 16
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    vmv.v.x v24, a1
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v8, v16
-; RV64-NEXT:    vse64.v v24, (a0)
+; RV64-NEXT:    vrgatherei16.vv v16, v8, v24
+; RV64-NEXT:    vse64.v v16, (a0)
 ; RV64-NEXT:    ret
   %a = load <4 x i256>, ptr %p
   %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 0436a27409f81..3bb5e179e0d06 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -3225,26 +3225,26 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV32-NEXT:    addi a1, a1, 289
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vmerge.vim v16, v9, 1, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, %hi(.LCPI182_0)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI182_0)
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v14, 3
-; RV32-NEXT:    vmerge.vim v14, v14, 2, v0
-; RV32-NEXT:    vle16.v v16, (a1)
+; RV32-NEXT:    vmv.v.i v9, 3
+; RV32-NEXT:    vmerge.vim v9, v9, 2, v0
+; RV32-NEXT:    vle16.v v14, (a1)
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v14, 1, v0
+; RV32-NEXT:    vmerge.vim v17, v9, 1, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vsext.vf2 v14, v9
-; RV32-NEXT:    vsrl.vv v14, v10, v14
-; RV32-NEXT:    vmulhu.vv v14, v14, v16
-; RV32-NEXT:    vsub.vv v10, v10, v14
+; RV32-NEXT:    vsext.vf2 v8, v16
+; RV32-NEXT:    vsrl.vv v8, v10, v8
+; RV32-NEXT:    vmulhu.vv v8, v8, v14
+; RV32-NEXT:    vsub.vv v10, v10, v8
 ; RV32-NEXT:    vmulhu.vv v10, v10, v12
-; RV32-NEXT:    vadd.vv v10, v10, v14
-; RV32-NEXT:    vsext.vf2 v12, v8
-; RV32-NEXT:    vsrl.vv v8, v10, v12
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsext.vf2 v10, v17
+; RV32-NEXT:    vsrl.vv v8, v8, v10
 ; RV32-NEXT:    vse16.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -3263,12 +3263,12 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v14, a1
+; RV64-NEXT:    vmv.v.x v16, a1
 ; RV64-NEXT:    lui a1, %hi(.LCPI182_1)
 ; RV64-NEXT:    ld a1, %lo(.LCPI182_1)(a1)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vsext.vf2 v16, v14
-; RV64-NEXT:    vsrl.vv v14, v8, v16
+; RV64-NEXT:    vsext.vf2 v14, v16
+; RV64-NEXT:    vsrl.vv v14, v8, v14
 ; RV64-NEXT:    vmulhu.vv v12, v14, v12
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v14, a1
@@ -3305,11 +3305,11 @@ define void @mulhu_v8i32(ptr %x) {
 ; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vmulhu.vv v8, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v14, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vsext.vf4 v12, v10
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vsext.vf4 v10, v14
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x i32>, ptr %x
@@ -3482,26 +3482,26 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV32-NEXT:    vmv.s.x v0, a2
 ; RV32-NEXT:    lui a2, 1048560
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a2
+; RV32-NEXT:    vmv.v.x v14, a2
 ; RV32-NEXT:    addi a2, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a2
+; RV32-NEXT:    vmv.v.x v10, a2
 ; RV32-NEXT:    li a2, 63
 ; RV32-NEXT:    addi a1, a1, 1366
-; RV32-NEXT:    vmerge.vxm v12, v12, a1, v0
+; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    vsext.vf4 v14, v10
+; RV32-NEXT:    vsext.vf4 v12, v14
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmulh.vv v10, v8, v12
-; RV32-NEXT:    vmadd.vv v14, v8, v10
+; RV32-NEXT:    vmulh.vv v10, v8, v10
+; RV32-NEXT:    vmadd.vv v12, v8, v10
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v10, v8
+; RV32-NEXT:    vsext.vf4 v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v14, a2
-; RV32-NEXT:    vsra.vv v10, v14, v10
-; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vx v10, v12, a2
+; RV32-NEXT:    vsra.vv v8, v12, v8
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -3513,7 +3513,7 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV64-NEXT:    lui a2, 1044496
 ; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, -256
-; RV64-NEXT:    vmv.s.x v10, a2
+; RV64-NEXT:    vmv.s.x v14, a2
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    lui a2, %hi(.LCPI188_0)
@@ -3521,19 +3521,19 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; RV64-NEXT:    vmv.v.i v0, 5
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a1
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vmerge.vxm v12, v12, a2, v0
+; RV64-NEXT:    vmerge.vxm v10, v10, a2, v0
 ; RV64-NEXT:    lui a2, 4096
 ; RV64-NEXT:    addi a2, a2, 256
-; RV64-NEXT:    vsext.vf8 v14, v10
-; RV64-NEXT:    vmulh.vv v10, v8, v12
-; RV64-NEXT:    vmadd.vv v14, v8, v10
-; RV64-NEXT:    vmv.s.x v8, a2
-; RV64-NEXT:    vsext.vf8 v10, v8
-; RV64-NEXT:    vsrl.vx v8, v14, a1
-; RV64-NEXT:    vsra.vv v10, v14, v10
-; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsext.vf8 v12, v14
+; RV64-NEXT:    vmulh.vv v10, v8, v10
+; RV64-NEXT:    vmadd.vv v12, v8, v10
+; RV64-NEXT:    vmv.s.x v10, a2
+; RV64-NEXT:    vsext.vf8 v8, v10
+; RV64-NEXT:    vsrl.vx v10, v12, a1
+; RV64-NEXT:    vsra.vv v8, v12, v8
+; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   %a = load <4 x i64>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index e93b264c5f4f0..954a5889f31ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -183,10 +183,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 100
+; RV32-NEXT:    li a3, 96
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 100 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb
 ; RV32-NEXT:    addi a4, a1, 128
 ; RV32-NEXT:    addi a5, a1, 256
 ; RV32-NEXT:    li a2, 32
@@ -197,7 +197,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vmv.s.x v3, a3
 ; RV32-NEXT:    vle32.v v24, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a6, 84
+; RV32-NEXT:    li a6, 80
 ; RV32-NEXT:    mul a1, a1, a6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -205,7 +205,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vslideup.vi v8, v16, 4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a6, 60
+; RV32-NEXT:    li a6, 56
 ; RV32-NEXT:    mul a1, a1, a6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -213,14 +213,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a6, 92
+; RV32-NEXT:    li a6, 88
 ; RV32-NEXT:    mul a1, a1, a6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a6, 92
+; RV32-NEXT:    li a6, 88
 ; RV32-NEXT:    mul a1, a1, a6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -228,7 +228,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v8, v16, 10, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a6, 72
+; RV32-NEXT:    li a6, 68
 ; RV32-NEXT:    mul a1, a1, a6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -238,7 +238,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a5, a5, 3
 ; RV32-NEXT:    vmv.s.x v0, a5
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 60
+; RV32-NEXT:    li a4, 56
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -249,7 +249,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 92
+; RV32-NEXT:    li a4, 88
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -257,8 +257,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v4, v16, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 68
-; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
@@ -274,20 +273,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v24, v6
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 52
+; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 76
+; RV32-NEXT:    li a4, 72
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 84
+; RV32-NEXT:    li a4, 80
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -297,7 +296,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 44
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -313,13 +312,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vmv.s.x v0, a5
 ; RV32-NEXT:    addi a6, a6, 3
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li t1, 84
+; RV32-NEXT:    li t1, 80
 ; RV32-NEXT:    mul a5, a5, t1
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li t1, 76
+; RV32-NEXT:    li t1, 72
 ; RV32-NEXT:    mul a5, a5, t1
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
@@ -334,35 +333,29 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vmv.s.x v0, a6
 ; RV32-NEXT:    addi a5, a7, 192
 ; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    li a7, 92
+; RV32-NEXT:    li a7, 88
 ; RV32-NEXT:    mul a6, a6, a7
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 16
 ; RV32-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    li a7, 60
+; RV32-NEXT:    li a7, 56
 ; RV32-NEXT:    mul a6, a6, a7
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 16
 ; RV32-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v8, v8, v24, v0
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    li a7, 28
-; RV32-NEXT:    mul a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs4r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vmerge.vvm v4, v8, v24, v0
 ; RV32-NEXT:    vmv.s.x v0, a5
 ; RV32-NEXT:    addi a3, a3, 12
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 84
+; RV32-NEXT:    li a6, 80
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 76
+; RV32-NEXT:    li a6, 72
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
@@ -370,15 +363,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 36
-; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    slli a5, a5, 5
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv.s.x v0, a3
 ; RV32-NEXT:    addi a3, t0, 768
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 92
+; RV32-NEXT:    li a6, 88
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
@@ -386,20 +378,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 5
+; RV32-NEXT:    li a6, 28
+; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs4r.v v8, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv.s.x v0, a3
 ; RV32-NEXT:    lui a3, 3073
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 84
+; RV32-NEXT:    li a6, 80
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 76
+; RV32-NEXT:    li a6, 72
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
@@ -413,22 +406,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv.s.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle16.v v2, (a1)
+; RV32-NEXT:    vle16.v v28, (a1)
 ; RV32-NEXT:    addi a1, a3, -1024
 ; RV32-NEXT:    vmv4r.v v8, v24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 92
+; RV32-NEXT:    li a4, 88
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vmerge.vvm v12, v16, v24, v0
+; RV32-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs4r.v v12, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
@@ -436,17 +429,17 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v2
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v28
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 84
+; RV32-NEXT:    li a3, 80
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -454,7 +447,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 84
+; RV32-NEXT:    li a2, 80
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -466,7 +459,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 92
+; RV32-NEXT:    li a2, 88
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -474,19 +467,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 92
+; RV32-NEXT:    li a2, 88
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 72
+; RV32-NEXT:    li a2, 68
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 52
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -494,77 +487,69 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v16, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 72
+; RV32-NEXT:    li a2, 68
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 68
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 44
+; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmv.v.v v16, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 68
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 28
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v4, v8, v12
+; RV32-NEXT:    vrgatherei16.vv v20, v4, v12
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v8
+; RV32-NEXT:    vmv.v.v v20, v8
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_4)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_4)
 ; RV32-NEXT:    lui a2, %hi(.LCPI8_5)
 ; RV32-NEXT:    addi a2, a2, %lo(.LCPI8_5)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v0, (a1)
+; RV32-NEXT:    vle16.v v16, (a1)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v16, (a2)
+; RV32-NEXT:    vle16.v v24, (a2)
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_6)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_6)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle16.v v2, (a1)
+; RV32-NEXT:    vle16.v v18, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 36
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v0
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v0, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 28
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v24, v12, v16
+; RV32-NEXT:    vrgatherei16.vv v0, v12, v24
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v24, v8
+; RV32-NEXT:    vmv.v.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v2
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v18
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_7)
 ; RV32-NEXT:    lui a2, %hi(.LCPI8_8)
@@ -574,62 +559,85 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_9)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v28, (a2)
+; RV32-NEXT:    vle16.v v14, (a2)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 72
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vle16.v v30, (a1)
+; RV32-NEXT:    vle16.v v13, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs1r.v v13, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v0, v16, v12
+; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v16, v4, v12
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v0, v8
+; RV32-NEXT:    vmv.v.v v16, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 84
+; RV32-NEXT:    li a2, 80
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 72
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl2r.v v6, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v28
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v6
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 92
+; RV32-NEXT:    li a2, 88
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl1r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v12, v16, v30
+; RV32-NEXT:    vrgatherei16.vv v12, v24, v28
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v12, v8
 ; RV32-NEXT:    addi a1, a0, 320
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vse32.v v12, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    vse32.v v0, (a1)
+; RV32-NEXT:    vse32.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 192
-; RV32-NEXT:    vse32.v v24, (a1)
+; RV32-NEXT:    vse32.v v0, (a1)
 ; RV32-NEXT:    addi a1, a0, 128
-; RV32-NEXT:    vse32.v v4, (a1)
+; RV32-NEXT:    vse32.v v20, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 6
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 72
+; RV32-NEXT:    li a2, 68
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 100
+; RV32-NEXT:    li a1, 96
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -642,14 +650,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 95
+; RV64-NEXT:    li a3, 93
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdf, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 95 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdd, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 93 * vlenb
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    li a3, 53
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -662,7 +671,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi a3, a3, %lo(.LCPI8_0)
 ; RV64-NEXT:    vmv.s.x v0, a4
 ; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    li a5, 71
+; RV64-NEXT:    li a5, 61
 ; RV64-NEXT:    mul a4, a4, a5
 ; RV64-NEXT:    add a4, sp, a4
 ; RV64-NEXT:    addi a4, a4, 16
@@ -673,21 +682,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v8, 8
 ; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a6, a5, 6
-; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    li a6, 77
+; RV64-NEXT:    mul a5, a5, a6
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 16
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a6, a5, 6
-; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    li a6, 77
+; RV64-NEXT:    mul a5, a5, a6
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 16
 ; RV64-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vslideup.vi v24, v16, 5, v0.t
 ; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    li a6, 83
+; RV64-NEXT:    li a6, 73
 ; RV64-NEXT:    mul a5, a5, a6
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 16
@@ -695,7 +704,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v24, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a5, 87
+; RV64-NEXT:    li a5, 85
 ; RV64-NEXT:    mul a2, a2, a5
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -703,20 +712,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vle16.v v12, (a3)
 ; RV64-NEXT:    vmv.s.x v0, a4
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 87
+; RV64-NEXT:    li a3, 85
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    li a3, 53
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; RV64-NEXT:    vrgatherei16.vv v0, v24, v12
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 47
+; RV64-NEXT:    li a3, 37
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -724,21 +734,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vslideup.vi v12, v8, 1
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 71
+; RV64-NEXT:    li a3, 61
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl1r.v v7, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 6
-; RV64-NEXT:    sub a2, a3, a2
+; RV64-NEXT:    li a3, 77
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vslideup.vi v12, v24, 4, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 79
+; RV64-NEXT:    li a3, 69
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -751,7 +761,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi a2, a3, 260
 ; RV64-NEXT:    vmv8r.v v24, v16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a5, 87
+; RV64-NEXT:    li a5, 85
 ; RV64-NEXT:    mul a3, a3, a5
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
@@ -759,8 +769,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a5, 18
-; RV64-NEXT:    mul a3, a3, a5
+; RV64-NEXT:    slli a3, a3, 3
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -768,21 +777,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vmv.s.x v2, a4
 ; RV64-NEXT:    vmv4r.v v12, v8
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 87
+; RV64-NEXT:    li a3, 85
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 39
+; RV64-NEXT:    li a3, 29
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v2
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -791,15 +800,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vslideup.vi v12, v8, 5, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 6
-; RV64-NEXT:    sub a2, a3, a2
+; RV64-NEXT:    li a3, 77
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v12, v16, 4, v0.t
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v12, v24, 4, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 75
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 6
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
@@ -807,133 +816,139 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vmv1r.v v0, v2
 ; RV64-NEXT:    vslideup.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vrgather.vi v12, v16, 5, v0.t
+; RV64-NEXT:    vrgather.vi v12, v24, 5, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
+; RV64-NEXT:    li a3, 25
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv4r.v v8, v16
 ; RV64-NEXT:    lui a2, 8
 ; RV64-NEXT:    addi a2, a2, 520
 ; RV64-NEXT:    vmv.s.x v0, a2
-; RV64-NEXT:    vslideup.vi v12, v16, 6
+; RV64-NEXT:    vslideup.vi v12, v24, 6
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 87
+; RV64-NEXT:    li a3, 85
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 53
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 27
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 77
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vslideup.vi v12, v8, 1, v0.t
+; RV64-NEXT:    vslideup.vi v12, v16, 1, v0.t
 ; RV64-NEXT:    lui a2, %hi(.LCPI8_1)
 ; RV64-NEXT:    addi a2, a2, %lo(.LCPI8_1)
 ; RV64-NEXT:    li a3, 192
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vle16.v v16, (a2)
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vle16.v v6, (a2)
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 26
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a2, a2, 4
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgather.vi v20, v16, 2
-; RV64-NEXT:    vmerge.vvm v12, v20, v12, v0
+; RV64-NEXT:    vrgather.vi v28, v16, 2
+; RV64-NEXT:    vmerge.vvm v16, v28, v12, v0
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 71
+; RV64-NEXT:    li a3, 61
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 18
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vl2r.v v12, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v0, v16, v12
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 10
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v6
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a2, %hi(.LCPI8_2)
 ; RV64-NEXT:    addi a2, a2, %lo(.LCPI8_2)
 ; RV64-NEXT:    li a3, 1040
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    addi a1, a1, -2016
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 87
+; RV64-NEXT:    li a4, 85
 ; RV64-NEXT:    mul a3, a3, a4
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
-; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV64-NEXT:    vmerge.vvm v16, v16, v24, v0
+; RV64-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 18
+; RV64-NEXT:    li a4, 53
 ; RV64-NEXT:    mul a3, a3, a4
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
-; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vmerge.vvm v8, v24, v16, v0
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    vle16.v v12, (a2)
+; RV64-NEXT:    vle16.v v6, (a2)
 ; RV64-NEXT:    li a1, 64
+; RV64-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 87
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vmerge.vvm v16, v16, v24, v0
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 87
+; RV64-NEXT:    li a3, 85
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 39
+; RV64-NEXT:    li a2, 29
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v12
-; RV64-NEXT:    vmv4r.v v12, v8
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v6
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv4r.v v28, v8
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vslideup.vi v12, v8, 5, v0.t
+; RV64-NEXT:    vslideup.vi v28, v8, 5, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 83
+; RV64-NEXT:    li a2, 73
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 47
+; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
@@ -941,26 +956,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
 ; RV64-NEXT:    vmv.v.v v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 83
+; RV64-NEXT:    li a2, 73
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 10
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 79
+; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v0
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v8, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 79
+; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
@@ -968,128 +979,138 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    lui a1, %hi(.LCPI8_3)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_3)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vle16.v v8, (a1)
+; RV64-NEXT:    vle16.v v20, (a1)
 ; RV64-NEXT:    lui a1, %hi(.LCPI8_4)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_4)
-; RV64-NEXT:    vle16.v v10, (a1)
+; RV64-NEXT:    vle16.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 75
+; RV64-NEXT:    li a2, 77
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vs2r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 6
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v16, v24
+; RV64-NEXT:    vmv.v.v v8, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 75
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a2, a1, 6
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 27
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v0, v8
+; RV64-NEXT:    vrgatherei16.vv v0, v8, v20
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 25
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v4, v24
+; RV64-NEXT:    vmv.v.v v12, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 18
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v10
+; RV64-NEXT:    vrgatherei16.vv v0, v16, v8
 ; RV64-NEXT:    lui a1, %hi(.LCPI8_5)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_5)
-; RV64-NEXT:    vle16.v v8, (a1)
+; RV64-NEXT:    vle16.v v20, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 71
+; RV64-NEXT:    li a2, 61
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v28, v24
+; RV64-NEXT:    vmv.v.v v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 71
+; RV64-NEXT:    li a2, 61
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 55
+; RV64-NEXT:    li a2, 45
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgather.vi v28, v24, 3
+; RV64-NEXT:    vrgather.vi v8, v0, 3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 26
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmerge.vvm v12, v28, v12, v0
+; RV64-NEXT:    vmerge.vvm v8, v8, v28, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 87
+; RV64-NEXT:    li a2, 85
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v8
+; RV64-NEXT:    vrgatherei16.vv v24, v0, v20
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v24
+; RV64-NEXT:    vmv.v.v v8, v24
 ; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 71
+; RV64-NEXT:    li a3, 61
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v20, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vse64.v v20, (a1)
 ; RV64-NEXT:    addi a1, a0, 320
-; RV64-NEXT:    vse64.v v12, (a1)
+; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 192
-; RV64-NEXT:    vse64.v v4, (a1)
+; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 75
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 6
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 79
+; RV64-NEXT:    li a3, 69
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 83
+; RV64-NEXT:    li a2, 73
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 95
+; RV64-NEXT:    li a1, 93
 ; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
@@ -1290,17 +1311,17 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
 ; CHECK-LABEL: load_factor4_one_active_storeback_full:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vle32.v v12, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 4
-; CHECK-NEXT:    vmv1r.v v13, v8
-; CHECK-NEXT:    vmv1r.v v14, v12
+; CHECK-NEXT:    vslidedown.vi v8, v12, 4
+; CHECK-NEXT:    vmv1r.v v17, v12
+; CHECK-NEXT:    vmv1r.v v18, v8
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v8, 8
-; CHECK-NEXT:    vmv1r.v v15, v16
-; CHECK-NEXT:    vslidedown.vi v16, v8, 12
+; CHECK-NEXT:    vslidedown.vi v8, v12, 8
+; CHECK-NEXT:    vmv1r.v v19, v8
+; CHECK-NEXT:    vslidedown.vi v20, v12, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vsseg4e32.v v13, (a0)
+; CHECK-NEXT:    vsseg4e32.v v17, (a0)
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <16 x i32>, ptr %ptr
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-inttoptr-ptrtoint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-inttoptr-ptrtoint.ll
index 03ce5bea55665..8b6270e86af36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-inttoptr-ptrtoint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-inttoptr-ptrtoint.ll
@@ -7,8 +7,8 @@ define <4 x ptr> @inttoptr_v4p0_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK-LABEL: inttoptr_v4p0_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
   ret <4 x ptr> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll
index 8282b8884aed6..466fe744a1376 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll
@@ -46,15 +46,15 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x, <3 x i1> %m, i32 zeroext %e
 ; RV32-LABEL: llrint_v3i64_v3f32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV32-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: llrint_v3i64_v3f32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV64-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    vmv1r.v v10, v8
+; RV64-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV64-NEXT:    ret
   %a = call <3 x i64> @llvm.vp.llrint.v3i64.v3f32(<3 x float> %x, <3 x i1> %m, i32 %evl)
   ret <3 x i64> %a
@@ -65,15 +65,15 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x, <4 x i1> %m, i32 zeroext %e
 ; RV32-LABEL: llrint_v4i64_v4f32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV32-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: llrint_v4i64_v4f32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV64-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    vmv1r.v v10, v8
+; RV64-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV64-NEXT:    ret
   %a = call <4 x i64> @llvm.vp.llrint.v4i64.v4f32(<4 x float> %x, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %a
@@ -84,15 +84,15 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x, <8 x i1> %m, i32 zeroext %e
 ; RV32-LABEL: llrint_v8i64_v8f32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.x.f.v v12, v8, v0.t
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: llrint_v8i64_v8f32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.x.f.v v12, v8, v0.t
-; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    vmv2r.v v12, v8
+; RV64-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
 ; RV64-NEXT:    ret
   %a = call <8 x i64> @llvm.vp.llrint.v8i64.v8f32(<8 x float> %x, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %a
@@ -103,15 +103,15 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x, <16 x i1> %m, i32 zeroe
 ; RV32-LABEL: llrint_v16i64_v16f32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; RV32-NEXT:    vfwcvt.x.f.v v16, v8, v0.t
-; RV32-NEXT:    vmv8r.v v8, v16
+; RV32-NEXT:    vmv4r.v v16, v8
+; RV32-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: llrint_v16i64_v16f32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; RV64-NEXT:    vfwcvt.x.f.v v16, v8, v0.t
-; RV64-NEXT:    vmv8r.v v8, v16
+; RV64-NEXT:    vmv4r.v v16, v8
+; RV64-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
 ; RV64-NEXT:    ret
   %a = call <16 x i64> @llvm.vp.llrint.v16i64.v16f32(<16 x float> %x, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index f27614c93985f..9bd116b2303c8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -189,18 +189,18 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x) {
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vslidedown.vi v11, v8, 3
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
+; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vfmv.f.s fa5, v11
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
@@ -298,18 +298,18 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vslidedown.vi v11, v8, 3
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
+; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vfmv.f.s fa5, v11
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
@@ -618,42 +618,42 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    sd a0, 248(sp)
 ; RV64-NEXT:    flw fa5, 120(sp)
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vslidedown.vi v13, v8, 1
 ; RV64-NEXT:    fcvt.l.s a0, fa4
 ; RV64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    sd a1, 240(sp)
 ; RV64-NEXT:    flw fa5, 116(sp)
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v14, v8, 7
+; RV64-NEXT:    vslidedown.vi v10, v8, 7
 ; RV64-NEXT:    fcvt.l.s a1, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v11
+; RV64-NEXT:    vfmv.f.s fa4, v12
 ; RV64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-NEXT:    sd a2, 232(sp)
 ; RV64-NEXT:    flw fa5, 112(sp)
 ; RV64-NEXT:    fcvt.l.s a2, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v12
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    vfmv.f.s fa4, v13
+; RV64-NEXT:    vslidedown.vi v12, v8, 6
 ; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    sd a3, 224(sp)
 ; RV64-NEXT:    flw fa5, 108(sp)
 ; RV64-NEXT:    fcvt.l.s a3, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v14
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 5
 ; RV64-NEXT:    fcvt.l.s a4, fa5
 ; RV64-NEXT:    sd a4, 216(sp)
 ; RV64-NEXT:    flw fa5, 104(sp)
 ; RV64-NEXT:    fcvt.l.s a4, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    vfmv.f.s fa4, v12
 ; RV64-NEXT:    fcvt.l.s a5, fa4
 ; RV64-NEXT:    fcvt.l.s a6, fa5
 ; RV64-NEXT:    sd a6, 208(sp)
 ; RV64-NEXT:    flw fa5, 100(sp)
-; RV64-NEXT:    vfmv.f.s fa4, v12
+; RV64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-NEXT:    fcvt.l.s a6, fa4
 ; RV64-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-NEXT:    fcvt.l.s a7, fa5
@@ -866,15 +866,15 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; RV64-LABEL: llrint_v4i64_v4f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.d a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v12
+; RV64-NEXT:    fcvt.l.d a1, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.d a0, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll
index 1c920e42f7d41..5b5163c17a5c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll
@@ -70,8 +70,8 @@ define <3 x iXLen> @lrint_v3f32(<3 x float> %x, <3 x i1> %m, i32 zeroext %evl) {
 ; RV64-i64-LABEL: lrint_v3f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV64-i64-NEXT:    vmv2r.v v8, v10
+; RV64-i64-NEXT:    vmv1r.v v10, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <3 x iXLen> @llvm.vp.lrint.v3iXLen.v3f32(<3 x float> %x, <3 x i1> %m, i32 %evl)
   ret <3 x iXLen> %a
@@ -94,8 +94,8 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x, <4 x i1> %m, i32 zeroext %evl) {
 ; RV64-i64-LABEL: lrint_v4f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV64-i64-NEXT:    vmv2r.v v8, v10
+; RV64-i64-NEXT:    vmv1r.v v10, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <4 x iXLen> @llvm.vp.lrint.v4iXLen.v4f32(<4 x float> %x, <4 x i1> %m, i32 %evl)
   ret <4 x iXLen> %a
@@ -118,8 +118,8 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x, <8 x i1> %m, i32 zeroext %evl) {
 ; RV64-i64-LABEL: lrint_v8f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v12, v8, v0.t
-; RV64-i64-NEXT:    vmv4r.v v8, v12
+; RV64-i64-NEXT:    vmv2r.v v12, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <8 x iXLen> @llvm.vp.lrint.v8iXLen.v8f32(<8 x float> %x, <8 x i1> %m, i32 %evl)
   ret <8 x iXLen> %a
@@ -142,8 +142,8 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x, <16 x i1> %m, i32 zeroext %ev
 ; RV64-i64-LABEL: lrint_v16f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v16, v8, v0.t
-; RV64-i64-NEXT:    vmv8r.v v8, v16
+; RV64-i64-NEXT:    vmv4r.v v16, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <16 x iXLen> @llvm.vp.lrint.v16iXLen.v16f32(<16 x float> %x, <16 x i1> %m, i32 %evl)
   ret <16 x iXLen> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index 2f58e3dd2769f..27abd624f7765 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -125,18 +125,18 @@ define <3 x iXLen> @lrint_v3f32(<3 x float> %x) {
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v11, v8, 3
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a0
+; RV64-i64-NEXT:    vmv.v.x v8, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vfmv.f.s fa5, v11
 ; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
@@ -193,18 +193,18 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v11, v8, 3
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a0
+; RV64-i64-NEXT:    vmv.v.x v8, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vfmv.f.s fa5, v11
 ; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
@@ -537,42 +537,42 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    sd a0, 248(sp)
 ; RV64-i64-NEXT:    flw fa5, 120(sp)
-; RV64-i64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-i64-NEXT:    vslidedown.vi v13, v8, 1
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa4
 ; RV64-i64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    sd a1, 240(sp)
 ; RV64-i64-NEXT:    flw fa5, 116(sp)
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v14, v8, 7
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 7
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v11
+; RV64-i64-NEXT:    vfmv.f.s fa4, v12
 ; RV64-i64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-i64-NEXT:    sd a2, 232(sp)
 ; RV64-i64-NEXT:    flw fa5, 112(sp)
 ; RV64-i64-NEXT:    fcvt.l.s a2, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v12
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i64-NEXT:    vfmv.f.s fa4, v13
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 6
 ; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    sd a3, 224(sp)
 ; RV64-i64-NEXT:    flw fa5, 108(sp)
 ; RV64-i64-NEXT:    fcvt.l.s a3, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v14
-; RV64-i64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 5
 ; RV64-i64-NEXT:    fcvt.l.s a4, fa5
 ; RV64-i64-NEXT:    sd a4, 216(sp)
 ; RV64-i64-NEXT:    flw fa5, 104(sp)
 ; RV64-i64-NEXT:    fcvt.l.s a4, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    vfmv.f.s fa4, v12
 ; RV64-i64-NEXT:    fcvt.l.s a5, fa4
 ; RV64-i64-NEXT:    fcvt.l.s a6, fa5
 ; RV64-i64-NEXT:    sd a6, 208(sp)
 ; RV64-i64-NEXT:    flw fa5, 100(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa4, v12
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-i64-NEXT:    fcvt.l.s a6, fa4
 ; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-i64-NEXT:    fcvt.l.s a7, fa5
@@ -683,15 +683,15 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; RV32-LABEL: lrint_v4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    fcvt.w.d a0, fa5
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
+; RV32-NEXT:    fcvt.w.d a1, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.d a0, fa5
@@ -707,15 +707,15 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; RV64-i32-LABEL: lrint_v4f64:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
+; RV64-i32-NEXT:    fcvt.l.d a1, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
@@ -731,15 +731,15 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; RV64-i64-LABEL: lrint_v4f64:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
 ; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v12
+; RV64-i64-NEXT:    fcvt.l.d a1, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
@@ -768,17 +768,17 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    mv a0, sp
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vslidedown.vi v14, v8, 1
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v14, v8, 2
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vfmv.f.s fa4, v12
+; RV32-NEXT:    vfmv.f.s fa4, v14
 ; RV32-NEXT:    fcvt.w.d a0, fa5
-; RV32-NEXT:    vfmv.f.s fa5, v14
+; RV32-NEXT:    vfmv.f.s fa5, v12
 ; RV32-NEXT:    vfmv.f.s fa3, v8
 ; RV32-NEXT:    fcvt.w.d a1, fa4
 ; RV32-NEXT:    fcvt.w.d a2, fa5
@@ -823,17 +823,17 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    andi sp, sp, -64
 ; RV64-i32-NEXT:    mv a0, sp
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-i32-NEXT:    vslidedown.vi v14, v8, 1
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v14, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-i32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i32-NEXT:    vse64.v v8, (a0)
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-i32-NEXT:    vfmv.f.s fa4, v12
+; RV64-i32-NEXT:    vfmv.f.s fa4, v14
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
-; RV64-i32-NEXT:    vfmv.f.s fa5, v14
+; RV64-i32-NEXT:    vfmv.f.s fa5, v12
 ; RV64-i32-NEXT:    vfmv.f.s fa3, v8
 ; RV64-i32-NEXT:    fcvt.l.d a1, fa4
 ; RV64-i32-NEXT:    fcvt.l.d a2, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 837a8891dfeb9..76590d47a3230 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -3733,8 +3733,8 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) {
 ; RV32V-LABEL: mgather_truemask_v4i64:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32V-NEXT:    vluxei32.v v10, (zero), v8
-; RV32V-NEXT:    vmv.v.v v8, v10
+; RV32V-NEXT:    vmv1r.v v10, v8
+; RV32V-NEXT:    vluxei32.v v8, (zero), v10
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_truemask_v4i64:
@@ -11010,8 +11010,8 @@ define <4 x double> @mgather_truemask_v4f64(<4 x ptr> %ptrs, <4 x double> %passt
 ; RV32V-LABEL: mgather_truemask_v4f64:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32V-NEXT:    vluxei32.v v10, (zero), v8
-; RV32V-NEXT:    vmv.v.v v8, v10
+; RV32V-NEXT:    vmv1r.v v10, v8
+; RV32V-NEXT:    vluxei32.v v8, (zero), v10
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_truemask_v4f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 7354f9afa9a71..8f2672e8f40c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -12441,14 +12441,14 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RV64V-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
-; RV64V-NEXT:    vslidedown.vi v8, v8, 16
-; RV64V-NEXT:    vslidedown.vi v10, v10, 16
+; RV64V-NEXT:    vslidedown.vi v16, v8, 16
+; RV64V-NEXT:    vslidedown.vi v18, v10, 16
 ; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64V-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64V-NEXT:    vsext.vf8 v16, v10
+; RV64V-NEXT:    vsext.vf8 v8, v18
 ; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64V-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64V-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v32i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index c6a8f2b50076e..2cebf14ec5aa4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -786,12 +786,12 @@ define float @vreduce_ord_fwadd_v64f32(ptr %x, float %s) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfmv.s.f v24, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v8, a0
+; CHECK-NEXT:    vslidedown.vx v16, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwredosum.vs v8, v8, v16
-; CHECK-NEXT:    vfwredosum.vs v8, v24, v8
+; CHECK-NEXT:    vfwredosum.vs v8, v8, v24
+; CHECK-NEXT:    vfwredosum.vs v8, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1175,12 +1175,12 @@ define double @vreduce_ord_fwadd_v32f64(ptr %x, double %s) {
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfmv.s.f v24, fa0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v8, 16
+; CHECK-NEXT:    vslidedown.vi v16, v8, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vfwredosum.vs v8, v8, v16
-; CHECK-NEXT:    vfwredosum.vs v8, v24, v8
+; CHECK-NEXT:    vfwredosum.vs v8, v8, v24
+; CHECK-NEXT:    vfwredosum.vs v8, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index 707d1202aca0f..3c0a4fd89660d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -1522,15 +1522,15 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV32-NEXT:    vle32.v v16, (a0)
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v16, 16
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwadd.vv v24, v16, v8
+; RV32-NEXT:    vwadd.vv v0, v16, v8
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwadd.vv v16, v0, v8
+; RV32-NEXT:    vwadd.vv v16, v24, v8
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v16
+; RV32-NEXT:    vadd.vv v8, v0, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
@@ -1588,15 +1588,15 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV32-NEXT:    vle32.v v16, (a0)
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v16, 16
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwaddu.vv v24, v16, v8
+; RV32-NEXT:    vwaddu.vv v0, v16, v8
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwaddu.vv v16, v0, v8
+; RV32-NEXT:    vwaddu.vv v16, v24, v8
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v16
+; RV32-NEXT:    vadd.vv v8, v0, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
index 8da605d35270d..c0a213034c95b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
@@ -76,11 +76,11 @@ define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vminu.vv v10, v8, v9
 ; CHECK-NEXT:    vmaxu.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vsub.vv v12, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
@@ -117,22 +117,22 @@ define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea
 ; CHECK-NEXT:    vle8.v v11, (a1)
 ; CHECK-NEXT:    vsub.vv v10, v10, v14
 ; CHECK-NEXT:    vminu.vv v14, v12, v13
-; CHECK-NEXT:    vmaxu.vv v12, v12, v13
-; CHECK-NEXT:    vwaddu.vv v16, v10, v8
-; CHECK-NEXT:    vsub.vv v8, v12, v14
+; CHECK-NEXT:    vmaxu.vv v15, v12, v13
+; CHECK-NEXT:    vwaddu.vv v12, v10, v8
+; CHECK-NEXT:    vsub.vv v8, v15, v14
 ; CHECK-NEXT:    vminu.vv v10, v9, v11
 ; CHECK-NEXT:    vmaxu.vv v9, v9, v11
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vzext.vf2 v14, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vv v8, v9, v10
+; CHECK-NEXT:    vsub.vv v16, v9, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v20, v12, v16
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vwaddu.wv v20, v20, v10
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
+; CHECK-NEXT:    vzext.vf2 v12, v16
+; CHECK-NEXT:    vwaddu.wv v8, v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vredsum.vs v8, v20, v8
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
index 6b81b781a898f..c0a352e160193 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
@@ -24,21 +24,21 @@ define <8 x float> @fpext_v8bf16(<8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, a7
 ; CHECK-NEXT:    vmv.s.x v9, a6
-; CHECK-NEXT:    vmv.s.x v10, a5
-; CHECK-NEXT:    vmv.s.x v12, a4
-; CHECK-NEXT:    vmv.s.x v11, a3
+; CHECK-NEXT:    vmv.s.x v11, a5
+; CHECK-NEXT:    vmv.s.x v10, a4
+; CHECK-NEXT:    vmv.s.x v12, a3
 ; CHECK-NEXT:    vmv.s.x v13, a2
 ; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.s.x v14, a1
-; CHECK-NEXT:    vslideup.vi v12, v10, 1
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vslideup.vi v10, v11, 1
+; CHECK-NEXT:    vslideup.vi v13, v12, 1
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    vslideup.vi v8, v14, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v9, 2
+; CHECK-NEXT:    vslideup.vi v10, v9, 2
 ; CHECK-NEXT:    vslideup.vi v8, v13, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
   %y = fpext <8 x bfloat> %x to <8 x float>
   ret <8 x float> %y
@@ -66,21 +66,21 @@ define <8 x float> @fpext_v8f16(<8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, a7
 ; CHECK-NEXT:    vmv.s.x v9, a6
-; CHECK-NEXT:    vmv.s.x v10, a5
-; CHECK-NEXT:    vmv.s.x v12, a4
-; CHECK-NEXT:    vmv.s.x v11, a3
+; CHECK-NEXT:    vmv.s.x v11, a5
+; CHECK-NEXT:    vmv.s.x v10, a4
+; CHECK-NEXT:    vmv.s.x v12, a3
 ; CHECK-NEXT:    vmv.s.x v13, a2
 ; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.s.x v14, a1
-; CHECK-NEXT:    vslideup.vi v12, v10, 1
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vslideup.vi v10, v11, 1
+; CHECK-NEXT:    vslideup.vi v13, v12, 1
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    vslideup.vi v8, v14, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v9, 2
+; CHECK-NEXT:    vslideup.vi v10, v9, 2
 ; CHECK-NEXT:    vslideup.vi v8, v13, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
   %y = fpext <8 x bfloat> %x to <8 x float>
   ret <8 x float> %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 90d4a5e511c87..8fef00dd597e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -412,12 +412,12 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v13, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -439,12 +439,12 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v13, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -493,14 +493,14 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v13, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -524,14 +524,14 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v10, v13
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -576,12 +576,12 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmnor.mm v0, v13, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -603,12 +603,12 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmnor.mm v0, v13, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -649,12 +649,12 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmfle.vv v12, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -674,12 +674,12 @@ define <8 x i1> @fcmp_ugt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmfle.vv v12, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -720,12 +720,12 @@ define <8 x i1> @fcmp_uge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -745,12 +745,12 @@ define <8 x i1> @fcmp_uge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -791,12 +791,12 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmfle.vv v12, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -816,12 +816,12 @@ define <8 x i1> @fcmp_ult_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmfle.vv v12, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -862,12 +862,12 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -887,12 +887,12 @@ define <8 x i1> @fcmp_ule_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; ZVFHMIN-NEXT:    vmnot.m v0, v8
+; ZVFHMIN-NEXT:    vmflt.vv v12, v10, v8, v0.t
+; ZVFHMIN-NEXT:    vmnot.m v0, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1009,14 +1009,14 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v13, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v10, v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v13, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1040,14 +1040,14 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v13, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v10, v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v10, v13
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1133,7 +1133,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    addi s0, sp, 896
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa s0, 0
 ; ZVFHMIN32-NEXT:    csrr a1, vlenb
-; ZVFHMIN32-NEXT:    li a2, 28
+; ZVFHMIN32-NEXT:    li a2, 29
 ; ZVFHMIN32-NEXT:    mul a1, a1, a2
 ; ZVFHMIN32-NEXT:    sub sp, sp, a1
 ; ZVFHMIN32-NEXT:    andi sp, sp, -128
@@ -1305,49 +1305,49 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    li a3, 28
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    li a3, 27
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 25
+; ZVFHMIN32-NEXT:    li a3, 26
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    li a3, 25
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    li a3, 24
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    li a3, 23
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    li a3, 22
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
@@ -1359,8 +1359,8 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 12
 ; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 11
 ; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 10
 ; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 9
@@ -1373,50 +1373,44 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 7
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a2, 1
-; ZVFHMIN32-NEXT:    add a2, a3, a2
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v16, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v29, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v21, v16, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v27, v16, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    li a3, 19
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v7, v16, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 15
 ; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 14
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
 ; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 12
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v16, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 19
+; ZVFHMIN32-NEXT:    li a3, 20
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v16, 9
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v16, 8
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    li a3, 13
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
@@ -1428,48 +1422,57 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 7
 ; ZVFHMIN32-NEXT:    vslidedown.vi v19, v0, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v0, 5
 ; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 4
 ; ZVFHMIN32-NEXT:    vslidedown.vi v15, v0, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 2
 ; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 15
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 6
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 1
+; ZVFHMIN32-NEXT:    add a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 13
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 14
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    slli a3, a2, 3
+; ZVFHMIN32-NEXT:    sub a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 12
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 13
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 10
+; ZVFHMIN32-NEXT:    slli a3, a2, 2
+; ZVFHMIN32-NEXT:    add a2, a3, a2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 12
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 11
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 11
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 11
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    slli a3, a2, 3
+; ZVFHMIN32-NEXT:    add a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 10
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    add a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 9
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v0, 9
 ; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
 ; ZVFHMIN32-NEXT:    vmv.x.s t3, v4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
@@ -1483,14 +1486,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v4, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s t4, v4
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v30
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v2
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v6
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v30
 ; ZVFHMIN32-NEXT:    vmv.x.s t0, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
@@ -1553,12 +1556,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t5, 736(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 480(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 27
+; ZVFHMIN32-NEXT:    li a1, 28
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s5, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 26
+; ZVFHMIN32-NEXT:    li a1, 27
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s6, 848(a0) # 8-byte Folded Reload
@@ -1569,12 +1572,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t5, 734(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 478(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 25
+; ZVFHMIN32-NEXT:    li a1, 26
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s7, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 24
+; ZVFHMIN32-NEXT:    li a1, 25
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s8, 848(a0) # 8-byte Folded Reload
@@ -1585,12 +1588,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t5, 732(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 476(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 23
+; ZVFHMIN32-NEXT:    li a1, 24
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s4, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 22
+; ZVFHMIN32-NEXT:    li a1, 23
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s3, 848(a0) # 8-byte Folded Reload
@@ -1601,23 +1604,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t6, 730(sp)
 ; ZVFHMIN32-NEXT:    lh s9, 474(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 21
+; ZVFHMIN32-NEXT:    li a1, 22
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s2, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    slli a1, a0, 1
-; ZVFHMIN32-NEXT:    add a0, a1, a0
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh t5, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v21
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
 ; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb t6, 173(sp)
 ; ZVFHMIN32-NEXT:    lh s9, 728(sp)
 ; ZVFHMIN32-NEXT:    lh s10, 472(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v21
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v29
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v23
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v9
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
@@ -1625,7 +1624,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh s9, 726(sp)
 ; ZVFHMIN32-NEXT:    lh s10, 470(sp)
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v19
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v27
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v3
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
@@ -1640,7 +1639,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb s10, 170(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 466(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v9
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v16
 ; ZVFHMIN32-NEXT:    vmv.x.s s11, v11
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
@@ -1724,27 +1723,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a1, 161(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 354(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s5, v23
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v29
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v31
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 241(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 352(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s4, v31
-; ZVFHMIN32-NEXT:    lh s3, 848(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    lh s4, 848(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v27
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 350(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v7
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 19
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
@@ -1879,15 +1878,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 19
+; ZVFHMIN32-NEXT:    li a3, 20
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
@@ -1899,13 +1898,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 508(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    li a3, 13
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v16
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a2, 1
+; ZVFHMIN32-NEXT:    add a2, a3, a2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1913,14 +1918,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 506(sp)
 ; ZVFHMIN32-NEXT:    csrr a3, vlenb
-; ZVFHMIN32-NEXT:    li a4, 6
-; ZVFHMIN32-NEXT:    mul a3, a3, a4
+; ZVFHMIN32-NEXT:    slli a4, a3, 3
+; ZVFHMIN32-NEXT:    sub a3, a4, a3
 ; ZVFHMIN32-NEXT:    add a3, sp, a3
 ; ZVFHMIN32-NEXT:    addi a3, a3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
 ; ZVFHMIN32-NEXT:    csrr a4, vlenb
-; ZVFHMIN32-NEXT:    slli a4, a4, 2
+; ZVFHMIN32-NEXT:    slli s3, a4, 2
+; ZVFHMIN32-NEXT:    add a4, s3, a4
 ; ZVFHMIN32-NEXT:    add a4, sp, a4
 ; ZVFHMIN32-NEXT:    addi a4, a4, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
@@ -1932,14 +1938,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 760(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 504(sp)
 ; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 10
+; ZVFHMIN32-NEXT:    li s4, 11
 ; ZVFHMIN32-NEXT:    mul s3, s3, s4
 ; ZVFHMIN32-NEXT:    add s3, sp, s3
 ; ZVFHMIN32-NEXT:    addi s3, s3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
 ; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    slli s3, s3, 3
+; ZVFHMIN32-NEXT:    slli s4, s3, 3
+; ZVFHMIN32-NEXT:    add s3, s4, s3
 ; ZVFHMIN32-NEXT:    add s3, sp, s3
 ; ZVFHMIN32-NEXT:    addi s3, s3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
@@ -1951,12 +1958,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 758(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 502(sp)
 ; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    slli s3, s3, 4
+; ZVFHMIN32-NEXT:    slli s5, s3, 4
+; ZVFHMIN32-NEXT:    add s3, s5, s3
 ; ZVFHMIN32-NEXT:    add s3, sp, s3
 ; ZVFHMIN32-NEXT:    addi s3, s3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v2
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v6
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -2253,7 +2261,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    addi s0, sp, 896
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa s0, 0
 ; ZVFHMIN64-NEXT:    csrr a1, vlenb
-; ZVFHMIN64-NEXT:    li a2, 28
+; ZVFHMIN64-NEXT:    li a2, 29
 ; ZVFHMIN64-NEXT:    mul a1, a1, a2
 ; ZVFHMIN64-NEXT:    sub sp, sp, a1
 ; ZVFHMIN64-NEXT:    andi sp, sp, -128
@@ -2425,49 +2433,49 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    li a3, 28
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    li a3, 27
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 25
+; ZVFHMIN64-NEXT:    li a3, 26
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    li a3, 25
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    li a3, 24
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    li a3, 23
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    li a3, 22
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
@@ -2479,8 +2487,8 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 12
 ; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 11
 ; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 10
 ; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 9
@@ -2493,50 +2501,44 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 7
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a2, 1
-; ZVFHMIN64-NEXT:    add a2, a3, a2
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v16, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v21, v16, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v27, v16, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    li a3, 19
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v7, v16, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 15
 ; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 14
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
 ; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 12
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v16, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 19
+; ZVFHMIN64-NEXT:    li a3, 20
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v16, 9
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v16, 8
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    li a3, 13
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
@@ -2548,48 +2550,57 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 7
 ; ZVFHMIN64-NEXT:    vslidedown.vi v19, v0, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v0, 5
 ; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 4
 ; ZVFHMIN64-NEXT:    vslidedown.vi v15, v0, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 2
 ; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 15
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 6
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 1
+; ZVFHMIN64-NEXT:    add a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 13
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 14
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    slli a3, a2, 3
+; ZVFHMIN64-NEXT:    sub a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 12
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 13
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 10
+; ZVFHMIN64-NEXT:    slli a3, a2, 2
+; ZVFHMIN64-NEXT:    add a2, a3, a2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 12
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 11
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 11
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 11
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    slli a3, a2, 3
+; ZVFHMIN64-NEXT:    add a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 10
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    add a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 9
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v0, 9
 ; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
 ; ZVFHMIN64-NEXT:    vmv.x.s t3, v4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
@@ -2603,14 +2614,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v4, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s t4, v4
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v30
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v2
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v6
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v30
 ; ZVFHMIN64-NEXT:    vmv.x.s t0, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
@@ -2673,12 +2684,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t5, 736(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 480(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 27
+; ZVFHMIN64-NEXT:    li a1, 28
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s5, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 26
+; ZVFHMIN64-NEXT:    li a1, 27
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s6, 800(a0) # 8-byte Folded Reload
@@ -2689,12 +2700,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t5, 734(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 478(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 25
+; ZVFHMIN64-NEXT:    li a1, 26
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s7, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 24
+; ZVFHMIN64-NEXT:    li a1, 25
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s8, 800(a0) # 8-byte Folded Reload
@@ -2705,12 +2716,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t5, 732(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 476(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 23
+; ZVFHMIN64-NEXT:    li a1, 24
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s4, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 22
+; ZVFHMIN64-NEXT:    li a1, 23
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s3, 800(a0) # 8-byte Folded Reload
@@ -2721,23 +2732,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t6, 730(sp)
 ; ZVFHMIN64-NEXT:    lh s9, 474(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 21
+; ZVFHMIN64-NEXT:    li a1, 22
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s2, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    slli a1, a0, 1
-; ZVFHMIN64-NEXT:    add a0, a1, a0
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh t5, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v21
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
 ; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb t6, 173(sp)
 ; ZVFHMIN64-NEXT:    lh s9, 728(sp)
 ; ZVFHMIN64-NEXT:    lh s10, 472(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v21
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v29
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v23
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v9
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
@@ -2745,7 +2752,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh s9, 726(sp)
 ; ZVFHMIN64-NEXT:    lh s10, 470(sp)
 ; ZVFHMIN64-NEXT:    vmv.x.s a2, v19
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v27
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v3
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
@@ -2760,7 +2767,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb s10, 170(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 466(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v9
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v16
 ; ZVFHMIN64-NEXT:    vmv.x.s s11, v11
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
@@ -2844,27 +2851,27 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a1, 161(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 354(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s5, v23
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v29
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v31
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 241(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 352(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s4, v31
-; ZVFHMIN64-NEXT:    lh s3, 800(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    lh s4, 800(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v27
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 350(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v7
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 19
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
@@ -2999,15 +3006,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 19
+; ZVFHMIN64-NEXT:    li a3, 20
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
@@ -3019,13 +3026,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 508(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    li a3, 13
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v16
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a2, 1
+; ZVFHMIN64-NEXT:    add a2, a3, a2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3033,14 +3046,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 506(sp)
 ; ZVFHMIN64-NEXT:    csrr a3, vlenb
-; ZVFHMIN64-NEXT:    li a4, 6
-; ZVFHMIN64-NEXT:    mul a3, a3, a4
+; ZVFHMIN64-NEXT:    slli a4, a3, 3
+; ZVFHMIN64-NEXT:    sub a3, a4, a3
 ; ZVFHMIN64-NEXT:    add a3, sp, a3
 ; ZVFHMIN64-NEXT:    addi a3, a3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
 ; ZVFHMIN64-NEXT:    csrr a4, vlenb
-; ZVFHMIN64-NEXT:    slli a4, a4, 2
+; ZVFHMIN64-NEXT:    slli s3, a4, 2
+; ZVFHMIN64-NEXT:    add a4, s3, a4
 ; ZVFHMIN64-NEXT:    add a4, sp, a4
 ; ZVFHMIN64-NEXT:    addi a4, a4, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
@@ -3052,14 +3066,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 760(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 504(sp)
 ; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 10
+; ZVFHMIN64-NEXT:    li s4, 11
 ; ZVFHMIN64-NEXT:    mul s3, s3, s4
 ; ZVFHMIN64-NEXT:    add s3, sp, s3
 ; ZVFHMIN64-NEXT:    addi s3, s3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
 ; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    slli s3, s3, 3
+; ZVFHMIN64-NEXT:    slli s4, s3, 3
+; ZVFHMIN64-NEXT:    add s3, s4, s3
 ; ZVFHMIN64-NEXT:    add s3, sp, s3
 ; ZVFHMIN64-NEXT:    addi s3, s3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
@@ -3071,12 +3086,13 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 758(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 502(sp)
 ; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    slli s3, s3, 4
+; ZVFHMIN64-NEXT:    slli s5, s3, 4
+; ZVFHMIN64-NEXT:    add s3, s5, s3
 ; ZVFHMIN64-NEXT:    add s3, sp, s3
 ; ZVFHMIN64-NEXT:    addi s3, s3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v2
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
index d1980ee3b0a6f..d8dc1f3588633 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
@@ -56,8 +56,8 @@ define <4 x i64> @vsext_v4i64_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl)
 ; CHECK-LABEL: vsext_v4i64_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf8 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf8 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -67,8 +67,8 @@ define <4 x i64> @vsext_v4i64_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vsext_v4i64_v4i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf8 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -104,8 +104,8 @@ define <4 x i64> @vsext_v4i64_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl
 ; CHECK-LABEL: vsext_v4i64_v4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -115,8 +115,8 @@ define <4 x i64> @vsext_v4i64_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vsext_v4i64_v4i16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -128,8 +128,8 @@ define <4 x i64> @vsext_v4i64_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl
 ; CHECK-LABEL: vsext_v4i64_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -139,8 +139,8 @@ define <4 x i64> @vsext_v4i64_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vsext_v4i64_v4i32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -153,24 +153,25 @@ define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB12_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v24, v8, v0.t
+; CHECK-NEXT:    vsext.vf2 v16, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vsext.vf2 v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> %va, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
index 8d11ecacfa41e..c05f306424519 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
@@ -212,17 +212,17 @@ define <16 x i1> @v16i1_v8i1(<8 x i1>) {
 define <8 x i32> @v8i32_v4i32(<4 x i32>) {
 ; CHECK-LABEL: v8i32_v4i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI5_0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v11, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vslidedown.vx v8, v11, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v8, v10
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vrgatherei16.vv v9, v10, v8
+; CHECK-NEXT:    vrgatherei16.vv v8, v10, v11
 ; CHECK-NEXT:    ret
   %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 1, i32 2, i32 0, i32 3>
   ret <8 x i32> %2
@@ -275,59 +275,59 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) {
 define <32 x i32> @v32i32_v4i32(<4 x i32>) {
 ; CHECK-LABEL: v32i32_v4i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v8
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    lui a1, 135432
 ; CHECK-NEXT:    addi a1, a1, 1161
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 270865
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 3
+; CHECK-NEXT:    vmv.v.i v8, 3
 ; CHECK-NEXT:    addi a1, a1, 548
-; CHECK-NEXT:    vmerge.vim v10, v10, 2, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 100550
 ; CHECK-NEXT:    addi a1, a1, 64
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v10
+; CHECK-NEXT:    vsext.vf2 v12, v8
 ; CHECK-NEXT:    vslidedown.vx v20, v12, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v17, v8, v20
+; CHECK-NEXT:    vrgatherei16.vv v9, v16, v20
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v20, v20, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v8, v16, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v12, v20, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v18, v8, v20
+; CHECK-NEXT:    vrgatherei16.vv v10, v16, v20
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v12, a1
+; CHECK-NEXT:    vslidedown.vx v20, v12, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v19, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v11, v16, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v24, a1
+; CHECK-NEXT:    vslidedown.vx v24, v20, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v20, v8, v24
+; CHECK-NEXT:    vrgatherei16.vv v12, v16, v20
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v12, a1
+; CHECK-NEXT:    vslidedown.vx v20, v24, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v21, v8, v12
-; CHECK-NEXT:    vrgatherei16.vv v22, v8, v24
+; CHECK-NEXT:    vrgatherei16.vv v13, v16, v24
+; CHECK-NEXT:    vrgatherei16.vv v14, v16, v20
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v24, a1
+; CHECK-NEXT:    vslidedown.vx v20, v20, a1
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v23, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vrgatherei16.vv v15, v16, v20
 ; CHECK-NEXT:    ret
   %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <32 x i32> <i32 2, i32 3, i32 0, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 2, i32 0, i32 3, i32 1, i32 1, i32 2, i32 0, i32 3, i32 1, i32 2, i32 0, i32 3, i32 1, i32 1, i32 2, i32 0, i32 3>
   ret <32 x i32> %2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index d06ce0a817f9e..9629b3547b3d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -70,17 +70,17 @@ define <8 x i32> @m2_pair_swap_vl8(<8 x i32> %v1) vscale_range(2,2) {
 ; RV32-LABEL: m2_pair_swap_vl8:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: m2_pair_swap_vl8:
@@ -183,17 +183,17 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a0, a0, 252
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vmv.v.i v0, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vslidedown.vi v10, v10, 1, v0.t
 ; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v11, v11, v8, v0
+; CHECK-NEXT:    vmerge.vvm v9, v9, v10, v0
 ; CHECK-NEXT:    addi a0, a1, 672
-; CHECK-NEXT:    vs2r.v v10, (a0)
+; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
   %1 = getelementptr i32, ptr %explicit_0, i64 63
   %2 = load <3 x i32>, ptr %1, align 1
@@ -338,20 +338,20 @@ define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) {
 ; RV32-LABEL: multi_chunks_shuffle:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v12
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index c222626a166fe..a6d5fd3e0e51a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -134,15 +134,15 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) {
 ; NO-ZVBB-NEXT:    slli a1, a1, 2
 ; NO-ZVBB-NEXT:    vmerge.vim v8, v8, 1, v0
 ; NO-ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; NO-ZVBB-NEXT:    vrsub.vx v12, v12, a2
+; NO-ZVBB-NEXT:    vrsub.vx v16, v12, a2
 ; NO-ZVBB-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; NO-ZVBB-NEXT:    vrgatherei16.vv v19, v8, v12
-; NO-ZVBB-NEXT:    vrgatherei16.vv v18, v9, v12
-; NO-ZVBB-NEXT:    vrgatherei16.vv v17, v10, v12
-; NO-ZVBB-NEXT:    vrgatherei16.vv v16, v11, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v15, v8, v16
+; NO-ZVBB-NEXT:    vrgatherei16.vv v14, v9, v16
+; NO-ZVBB-NEXT:    vrgatherei16.vv v13, v10, v16
+; NO-ZVBB-NEXT:    vrgatherei16.vv v12, v11, v16
 ; NO-ZVBB-NEXT:    addi a1, a1, -64
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; NO-ZVBB-NEXT:    vslidedown.vx v8, v16, a1
+; NO-ZVBB-NEXT:    vslidedown.vx v8, v12, a1
 ; NO-ZVBB-NEXT:    vmsne.vi v0, v8, 0
 ; NO-ZVBB-NEXT:    ret
 ;
@@ -161,14 +161,14 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
 ; CHECK-NEXT:    li a0, 128
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vid.v v24
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    addi a2, a1, -1
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vrsub.vx v24, v8, a2
+; CHECK-NEXT:    vrsub.vx v24, v24, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v15, v16, v24
 ; CHECK-NEXT:    vrgatherei16.vv v14, v17, v24
@@ -281,16 +281,16 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
 ; CHECK-NEXT:    addi a0, a0, -64
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v19, v8, v12
-; CHECK-NEXT:    vrgatherei16.vv v18, v9, v12
-; CHECK-NEXT:    vrgatherei16.vv v17, v10, v12
-; CHECK-NEXT:    vrgatherei16.vv v16, v11, v12
+; CHECK-NEXT:    vrgatherei16.vv v15, v8, v16
+; CHECK-NEXT:    vrgatherei16.vv v14, v9, v16
+; CHECK-NEXT:    vrgatherei16.vv v13, v10, v16
+; CHECK-NEXT:    vrgatherei16.vv v12, v11, v16
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <64 x i8> %a, <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <64 x i8> %res
@@ -356,12 +356,12 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i16> %res
@@ -377,14 +377,14 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v15, v8, v16
+; CHECK-NEXT:    vrgather.vv v14, v9, v16
+; CHECK-NEXT:    vrgather.vv v13, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v11, v16
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i16> %res
@@ -438,12 +438,12 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %res
@@ -499,12 +499,12 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) {
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -4
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x i64> %res
@@ -519,14 +519,14 @@ define <8 x i64> @reverse_v8i64(<8 x i64> %a) {
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v15, v8, v16
+; CHECK-NEXT:    vrgather.vv v14, v9, v16
+; CHECK-NEXT:    vrgather.vv v13, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v11, v16
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i64> %res
@@ -593,12 +593,12 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x half> %a, <16 x half> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x half> %res
@@ -614,14 +614,14 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) {
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v15, v8, v16
+; CHECK-NEXT:    vrgather.vv v14, v9, v16
+; CHECK-NEXT:    vrgather.vv v13, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v11, v16
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <32 x half> %a, <32 x half> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x half> %res
@@ -675,12 +675,12 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) {
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %res
@@ -736,12 +736,12 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) {
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -4
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x double> %res
@@ -756,14 +756,14 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v15, v8, v16
+; CHECK-NEXT:    vrgather.vv v14, v9, v16
+; CHECK-NEXT:    vrgather.vv v13, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v11, v16
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x double> %res
@@ -871,12 +871,12 @@ define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vi v10, v10, 15
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10
+; CHECK-NEXT:    vrsub.vi v11, v10, 15
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vrgather.vv v8, v9, v11
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 16
+; CHECK-NEXT:    vslideup.vi v8, v10, 16
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i8> %a, <16 x i8> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i8> %res
@@ -926,11 +926,11 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vi v10, v10, 7
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10
+; CHECK-NEXT:    vrsub.vi v11, v10, 7
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vrgather.vv v8, v9, v11
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i16> %a, <8 x i16> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i16> %res
@@ -945,14 +945,14 @@ define <32 x i16> @reverse_v32i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v14, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v16
+; CHECK-NEXT:    vrgather.vv v15, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v9, v16
+; CHECK-NEXT:    vrgather.vv v14, v11, v16
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v14, a0
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v12, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 16
@@ -990,11 +990,11 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vi v10, v10, 3
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10
+; CHECK-NEXT:    vrsub.vi v11, v10, 3
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vrgather.vv v8, v9, v11
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %res
@@ -1010,14 +1010,14 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v14, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v16
+; CHECK-NEXT:    vrgather.vv v15, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v9, v16
+; CHECK-NEXT:    vrgather.vv v14, v11, v16
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v14, a0
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v12, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    ret
@@ -1035,20 +1035,20 @@ define <32 x i32> @reverse_v32i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vrsub.vx v24, v16, a1
-; CHECK-NEXT:    vrgather.vv v23, v8, v24
-; CHECK-NEXT:    vrgather.vv v19, v12, v24
-; CHECK-NEXT:    vrgather.vv v22, v9, v24
-; CHECK-NEXT:    vrgather.vv v18, v13, v24
-; CHECK-NEXT:    vrgather.vv v21, v10, v24
-; CHECK-NEXT:    vrgather.vv v17, v14, v24
-; CHECK-NEXT:    vrgather.vv v20, v11, v24
-; CHECK-NEXT:    vrgather.vv v16, v15, v24
+; CHECK-NEXT:    vrgather.vv v19, v8, v24
+; CHECK-NEXT:    vrgather.vv v23, v12, v24
+; CHECK-NEXT:    vrgather.vv v18, v9, v24
+; CHECK-NEXT:    vrgather.vv v22, v13, v24
+; CHECK-NEXT:    vrgather.vv v17, v10, v24
+; CHECK-NEXT:    vrgather.vv v21, v14, v24
+; CHECK-NEXT:    vrgather.vv v16, v11, v24
+; CHECK-NEXT:    vrgather.vv v20, v15, v24
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v20, a0
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v16, v16, a0
+; CHECK-NEXT:    vslidedown.vx v8, v20, a0
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v24, 16
+; CHECK-NEXT:    vslideup.vi v8, v16, 16
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i32> %a, <16 x i32> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i32> %res
@@ -1079,14 +1079,14 @@ define <8 x i64> @reverse_v8i64_2(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -4
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v14, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v16
+; CHECK-NEXT:    vrgather.vv v15, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v9, v16
+; CHECK-NEXT:    vrgather.vv v14, v11, v16
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v14, a0
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v12, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
@@ -1138,11 +1138,11 @@ define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vi v10, v10, 7
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10
+; CHECK-NEXT:    vrsub.vi v11, v10, 7
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vrgather.vv v8, v9, v11
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x half> %a, <8 x half> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x half> %res
@@ -1200,11 +1200,11 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vi v10, v10, 3
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vrgather.vv v8, v9, v10
+; CHECK-NEXT:    vrsub.vi v11, v10, 3
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vrgather.vv v8, v9, v11
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %res
@@ -1220,14 +1220,14 @@ define <16 x float> @reverse_v16f32_2(<8 x float> %a, <8 x float> %b) {
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v14, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v16
+; CHECK-NEXT:    vrgather.vv v15, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v9, v16
+; CHECK-NEXT:    vrgather.vv v14, v11, v16
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v14, a0
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v12, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    ret
@@ -1260,14 +1260,14 @@ define <8 x double> @reverse_v8f64_2(<4 x double> %a, <4 x double> %b) {
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -4
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v12
-; CHECK-NEXT:    vrgather.vv v17, v10, v12
-; CHECK-NEXT:    vrgather.vv v14, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v11, v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v16
+; CHECK-NEXT:    vrgather.vv v15, v10, v16
+; CHECK-NEXT:    vrgather.vv v12, v9, v16
+; CHECK-NEXT:    vrgather.vv v14, v11, v16
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v12, v14, a0
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vslidedown.vx v12, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
@@ -1340,12 +1340,12 @@ define <8 x i32> @reverse_v8i32_undef_suffix(<8 x i32> %a) {
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i32> %res
@@ -1360,12 +1360,12 @@ define <8 x i32> @reverse_v8i32_undef_prefix(<8 x i32> %a) {
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v10
-; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
index 5b8e312a06ad4..fe2072990e2ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
@@ -615,17 +615,17 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) {
 ; RV32-LABEL: shuffle_v8i32_as_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i32_as_i64:
@@ -819,17 +819,17 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) {
 ; RV32-LABEL: shuffle_v8f32_as_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f32_as_i64:
@@ -864,17 +864,17 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2)
 ; RV32-LABEL: shuffle_v8f32_as_i64_exact:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f32_as_i64_exact:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 0928a47757430..9279e0a4d3a6c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -559,10 +559,10 @@ define void @vnsrl_0_i32_single_src_m8_2(ptr %in, ptr %out) {
 ; ZVE32F-NEXT:    lui a0, 349525
 ; ZVE32F-NEXT:    addi a0, a0, 1365
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; ZVE32F-NEXT:    vmv.v.x v16, a0
+; ZVE32F-NEXT:    vmv.v.x v24, a0
 ; ZVE32F-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVE32F-NEXT:    vcompress.vm v24, v8, v16
-; ZVE32F-NEXT:    vse32.v v24, (a1)
+; ZVE32F-NEXT:    vcompress.vm v16, v8, v24
+; ZVE32F-NEXT:    vse32.v v16, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <64 x i32>, ptr %in, align 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
index a2d41de5d1853..9f1f98893d04c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
@@ -345,8 +345,8 @@ define <4 x double> @vsitofp_v4f64_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-LABEL: vsitofp_v4f64_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -356,8 +356,8 @@ define <4 x double> @vsitofp_v4f64_v4i32_unmasked(<4 x i32> %va, i32 zeroext %ev
 ; CHECK-LABEL: vsitofp_v4f64_v4i32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
index a0d5d2ccc848d..b72e3cfcb920a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
@@ -345,8 +345,8 @@ define <4 x double> @vuitofp_v4f64_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-LABEL: vuitofp_v4f64_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -356,8 +356,8 @@ define <4 x double> @vuitofp_v4f64_v4i32_unmasked(<4 x i32> %va, i32 zeroext %ev
 ; CHECK-LABEL: vuitofp_v4f64_v4i32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 6d9f69f436fc4..29fbb8acc3358 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -365,13 +365,13 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m)
 ; RV64-SLOW-NEXT:    beqz a0, .LBB6_4
 ; RV64-SLOW-NEXT:  .LBB6_8: # %cond.store5
 ; RV64-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-SLOW-NEXT:    vslidedown.vi v12, v8, 3
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-SLOW-NEXT:    vslidedown.vi v10, v10, 3
+; RV64-SLOW-NEXT:    vslidedown.vi v8, v10, 3
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64-SLOW-NEXT:    vmv.x.s a0, v8
+; RV64-SLOW-NEXT:    vmv.x.s a0, v12
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-SLOW-NEXT:    vmv.x.s a1, v10
+; RV64-SLOW-NEXT:    vmv.x.s a1, v8
 ; RV64-SLOW-NEXT:    srli a2, a0, 8
 ; RV64-SLOW-NEXT:    sb a0, 0(a1)
 ; RV64-SLOW-NEXT:    sb a2, 1(a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll
index f80c158324684..8a8fe234cacd1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll
@@ -278,12 +278,12 @@ define <8 x half> @vfadd_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -304,12 +304,12 @@ define <8 x half> @vfadd_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -374,12 +374,12 @@ define <16 x half> @vfadd_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -400,12 +400,12 @@ define <16 x half> @vfadd_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll
index 23baa60de1532..30f509436214a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll
@@ -278,12 +278,12 @@ define <8 x half> @vfdiv_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -304,12 +304,12 @@ define <8 x half> @vfdiv_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -374,12 +374,12 @@ define <16 x half> @vfdiv_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -400,12 +400,12 @@ define <16 x half> @vfdiv_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll
index 11420a23285d0..c37df892de442 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll
@@ -188,11 +188,11 @@ define <8 x half> @vfmax_v8f16_vf(<8 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -213,11 +213,11 @@ define <8 x half> @vfmax_v8f16_fv(<8 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10
+; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,11 +261,11 @@ define <16 x half> @vfmax_v16f16_vf(<16 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -286,11 +286,11 @@ define <16 x half> @vfmax_v16f16_fv(<16 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12
+; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll
index e8ae32a45f7cd..90afe36a36c0f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll
@@ -188,11 +188,11 @@ define <8 x half> @vfmin_v8f16_vf(<8 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -213,11 +213,11 @@ define <8 x half> @vfmin_v8f16_fv(<8 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10
+; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,11 +261,11 @@ define <16 x half> @vfmin_v16f16_vf(<16 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -286,11 +286,11 @@ define <16 x half> @vfmin_v16f16_fv(<16 x half> %a, half %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12
+; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll
index 7e03a3cf95577..3c0819e549552 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll
@@ -278,12 +278,12 @@ define <8 x half> @vfmul_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -304,12 +304,12 @@ define <8 x half> @vfmul_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -374,12 +374,12 @@ define <16 x half> @vfmul_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -400,12 +400,12 @@ define <16 x half> @vfmul_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll
index 59fd8bbd17953..b8a6be40b3f32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll
@@ -59,8 +59,8 @@ define <8 x float> @vfpext_v8f16_v8f32(<8 x half> %va) strictfp {
 ; CHECK-LABEL: vfpext_v8f16_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %va, metadata !"fpexcept.strict")
   ret <8 x float> %evec
@@ -96,8 +96,8 @@ define <4 x double> @vfpext_v4f32_v4f64(<4 x float> %va) strictfp {
 ; CHECK-LABEL: vfpext_v4f32_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32(<4 x float> %va, metadata !"fpexcept.strict")
   ret <4 x double> %evec
@@ -108,8 +108,8 @@ define <8 x double> @vfpext_v8f32_v8f64(<8 x float> %va) strictfp {
 ; CHECK-LABEL: vfpext_v8f32_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f32(<8 x float> %va, metadata !"fpexcept.strict")
   ret <8 x double> %evec
@@ -170,8 +170,8 @@ define <8 x float> @vfpext_v8bf16_v8f32(<8 x bfloat> %va) strictfp {
 ; CHECK-LABEL: vfpext_v8bf16_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8bf16(<8 x bfloat> %va, metadata !"fpexcept.strict")
   ret <8 x float> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
index 5ac0d8d120cba..ac58a597a0812 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
@@ -485,8 +485,8 @@ define <8 x i32> @vfptosi_v8f16_v8i32(<8 x half> %va) strictfp {
 ; CHECK-LABEL: vfptosi_v8f16_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half> %va, metadata !"fpexcept.strict")
   ret <8 x i32> %evec
@@ -497,8 +497,8 @@ define <8 x i32> @vfptoui_v8f16_v8i32(<8 x half> %va) strictfp {
 ; CHECK-LABEL: vfptoui_v8f16_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f16(<8 x half> %va, metadata !"fpexcept.strict")
   ret <8 x i32> %evec
@@ -607,8 +607,8 @@ define <16 x i32> @vfptosi_v16f16_v16i32(<16 x half> %va) strictfp {
 ; CHECK-LABEL: vfptosi_v16f16_v16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <16 x i32> @llvm.experimental.constrained.fptosi.v16i32.v16f16(<16 x half> %va, metadata !"fpexcept.strict")
   ret <16 x i32> %evec
@@ -619,8 +619,8 @@ define <16 x i32> @vfptoui_v16f16_v16i32(<16 x half> %va) strictfp {
 ; CHECK-LABEL: vfptoui_v16f16_v16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <16 x i32> @llvm.experimental.constrained.fptoui.v16i32.v16f16(<16 x half> %va, metadata !"fpexcept.strict")
   ret <16 x i32> %evec
@@ -1051,8 +1051,8 @@ define <4 x i64> @vfptosi_v4f32_v4i64(<4 x float> %va) strictfp {
 ; CHECK-LABEL: vfptosi_v4f32_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float> %va, metadata !"fpexcept.strict")
   ret <4 x i64> %evec
@@ -1063,8 +1063,8 @@ define <4 x i64> @vfptoui_v4f32_v4i64(<4 x float> %va) strictfp {
 ; CHECK-LABEL: vfptoui_v4f32_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float> %va, metadata !"fpexcept.strict")
   ret <4 x i64> %evec
@@ -1173,8 +1173,8 @@ define <8 x i64> @vfptosi_v8f32_v8i64(<8 x float> %va) strictfp {
 ; CHECK-LABEL: vfptosi_v8f32_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f32(<8 x float> %va, metadata !"fpexcept.strict")
   ret <8 x i64> %evec
@@ -1185,8 +1185,8 @@ define <8 x i64> @vfptoui_v8f32_v8i64(<8 x float> %va) strictfp {
 ; CHECK-LABEL: vfptoui_v8f32_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f32(<8 x float> %va, metadata !"fpexcept.strict")
   ret <8 x i64> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll
index 58a510047d625..0f3a6de4f4a90 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll
@@ -278,12 +278,12 @@ define <8 x half> @vfsub_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -304,12 +304,12 @@ define <8 x half> @vfsub_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -374,12 +374,12 @@ define <16 x half> @vfsub_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -400,12 +400,12 @@ define <16 x half> @vfsub_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
index a48be7687106e..a9e9b757f372e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
@@ -1868,11 +1868,11 @@ define <8 x double> @vfwmacc_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, ha
 ; CHECK-LABEL: vfwmacc_vf_v8f64_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v13, fa0
+; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vfwcvt.f.f.v v14, v12
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v13
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmacc.vv v8, v14, v16
+; CHECK-NEXT:    vfwmacc.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x half> poison, half %c, i32 0
   %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1903,11 +1903,11 @@ define <8 x double> @vfwnmacc_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
 ; CHECK-LABEL: vfwnmacc_vf_v8f64_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v13, fa0
+; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vfwcvt.f.f.v v14, v12
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v13
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwnmacc.vv v8, v14, v16
+; CHECK-NEXT:    vfwnmacc.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x half> poison, half %c, i32 0
   %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1923,11 +1923,11 @@ define <8 x double> @vfwnmacc_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
 ; CHECK-LABEL: vfwnmacc_fv_v8f64_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v13, fa0
+; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vfwcvt.f.f.v v14, v12
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v13
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwnmacc.vv v8, v14, v16
+; CHECK-NEXT:    vfwnmacc.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x half> poison, half %c, i32 0
   %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1959,11 +1959,11 @@ define <8 x double> @vfwmsac_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, ha
 ; CHECK-LABEL: vfwmsac_vf_v8f64_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v13, fa0
+; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vfwcvt.f.f.v v14, v12
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v13
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmsac.vv v8, v14, v16
+; CHECK-NEXT:    vfwmsac.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x half> poison, half %c, i32 0
   %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1994,11 +1994,11 @@ define <8 x double> @vfwnmsac_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
 ; CHECK-LABEL: vfwnmsac_vf_v8f64_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v13, fa0
+; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vfwcvt.f.f.v v14, v12
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v13
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwnmsac.vv v8, v14, v16
+; CHECK-NEXT:    vfwnmsac.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x half> poison, half %c, i32 0
   %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -2013,11 +2013,11 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
 ; CHECK-LABEL: vfwnmsac_fv_v8f64_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v13, fa0
+; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vfwcvt.f.f.v v14, v12
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v13
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwnmsac.vv v8, v14, v16
+; CHECK-NEXT:    vfwnmsac.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x half> poison, half %c, i32 0
   %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll
index 5eb54fc7e299a..ce5483e3ae8cc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll
@@ -725,8 +725,8 @@ define <16 x half> @vsitofp_v16i8_v16f16(<16 x i8> %va) strictfp {
 ; CHECK-LABEL: vsitofp_v16i8_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i8(<16 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <16 x half> %evec
@@ -737,8 +737,8 @@ define <16 x half> @vuitofp_v16i8_v16f16(<16 x i8> %va) strictfp {
 ; CHECK-LABEL: vuitofp_v16i8_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i8(<16 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <16 x half> %evec
@@ -772,10 +772,11 @@ declare <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i8(<32 x i8>
 define <32 x half> @vsitofp_v32i8_v32f16(<32 x i8> %va) strictfp {
 ; CHECK-LABEL: vsitofp_v32i8_v32f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i8(<32 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <32 x half> %evec
@@ -785,10 +786,11 @@ declare <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i8(<32 x i8>
 define <32 x half> @vuitofp_v32i8_v32f16(<32 x i8> %va) strictfp {
 ; CHECK-LABEL: vuitofp_v32i8_v32f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i8(<32 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <32 x half> %evec
@@ -1031,8 +1033,8 @@ define <8 x float> @vsitofp_v8i16_v8f32(<8 x i16> %va) strictfp {
 ; CHECK-LABEL: vsitofp_v8i16_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <8 x float> @llvm.experimental.constrained.sitofp.v8f32.v8i16(<8 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x float> %evec
@@ -1043,8 +1045,8 @@ define <8 x float> @vuitofp_v8i16_v8f32(<8 x i16> %va) strictfp {
 ; CHECK-LABEL: vuitofp_v8i16_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <8 x float> @llvm.experimental.constrained.uitofp.v8f32.v8i16(<8 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x float> %evec
@@ -1101,8 +1103,8 @@ define <16 x float> @vsitofp_v16i16_v16f32(<16 x i16> %va) strictfp {
 ; CHECK-LABEL: vsitofp_v16i16_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <16 x float> @llvm.experimental.constrained.sitofp.v16f32.v16i16(<16 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <16 x float> %evec
@@ -1113,8 +1115,8 @@ define <16 x float> @vuitofp_v16i16_v16f32(<16 x i16> %va) strictfp {
 ; CHECK-LABEL: vuitofp_v16i16_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <16 x float> @llvm.experimental.constrained.uitofp.v16f32.v16i16(<16 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <16 x float> %evec
@@ -1335,8 +1337,8 @@ define <4 x double> @vsitofp_v4i32_v4f64(<4 x i32> %va) strictfp {
 ; CHECK-LABEL: vsitofp_v4i32_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x double> %evec
@@ -1347,8 +1349,8 @@ define <4 x double> @vuitofp_v4i32_v4f64(<4 x i32> %va) strictfp {
 ; CHECK-LABEL: vuitofp_v4i32_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x double> %evec
@@ -1405,8 +1407,8 @@ define <8 x double> @vsitofp_v8i32_v8f64(<8 x i32> %va) strictfp {
 ; CHECK-LABEL: vsitofp_v8i32_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <8 x double> @llvm.experimental.constrained.sitofp.v8f64.v8i32(<8 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x double> %evec
@@ -1417,8 +1419,8 @@ define <8 x double> @vuitofp_v8i32_v8f64(<8 x i32> %va) strictfp {
 ; CHECK-LABEL: vuitofp_v8i32_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <8 x double> @llvm.experimental.constrained.uitofp.v8f64.v8i32(<8 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x double> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 64f1819f0e329..83a195a66a502 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -875,8 +875,8 @@ define <4 x i64> @vpgather_v4i64(<4 x ptr> %ptrs, <4 x i1> %m, i32 zeroext %evl)
 ; RV32-LABEL: vpgather_v4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_v4i64:
@@ -892,8 +892,8 @@ define <4 x i64> @vpgather_truemask_v4i64(<4 x ptr> %ptrs, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_truemask_v4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vluxei32.v v10, (zero), v8
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_truemask_v4i64:
@@ -911,8 +911,8 @@ define <8 x i64> @vpgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, i32 zeroext %evl)
 ; RV32-LABEL: vpgather_v8i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_v8i64:
@@ -1757,8 +1757,8 @@ define <4 x double> @vpgather_v4f64(<4 x ptr> %ptrs, <4 x i1> %m, i32 zeroext %e
 ; RV32-LABEL: vpgather_v4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_v4f64:
@@ -1774,8 +1774,8 @@ define <4 x double> @vpgather_truemask_v4f64(<4 x ptr> %ptrs, i32 zeroext %evl)
 ; RV32-LABEL: vpgather_truemask_v4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vluxei32.v v10, (zero), v8
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_truemask_v4f64:
@@ -1793,8 +1793,8 @@ define <8 x double> @vpgather_v8f64(<8 x ptr> %ptrs, <8 x i1> %m, i32 zeroext %e
 ; RV32-LABEL: vpgather_v8f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_v8f64:
@@ -2117,12 +2117,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-LABEL: vpgather_baseidx_v32i8_v32f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 16
+; RV64-NEXT:    vslidedown.vi v16, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v24, v8
 ; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsext.vf8 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsext.vf8 v8, v16
+; RV64-NEXT:    vsll.vi v16, v8, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB95_2
@@ -2177,11 +2177,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    vslidedown.vi v16, v8, 16
 ; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v16, v8
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsext.vf8 v8, v16
+; RV64-NEXT:    vsll.vi v16, v8, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB96_2
@@ -2293,12 +2293,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-LABEL: vpgather_baseidx_v32i16_v32f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 16
+; RV64-NEXT:    vslidedown.vi v16, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsext.vf4 v16, v12
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsext.vf4 v8, v16
+; RV64-NEXT:    vsll.vi v16, v8, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB98_2
@@ -2353,11 +2353,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    vslidedown.vi v16, v8, 16
 ; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v16, v8
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsext.vf4 v8, v16
+; RV64-NEXT:    vsll.vi v16, v8, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB99_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
index 5975b0d0761eb..32ae81926bbee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
@@ -1016,12 +1016,12 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 0
-; RV32-NEXT:    vwsub.vx v14, v12, a0
+; RV32-NEXT:    vmv.v.i v14, 0
+; RV32-NEXT:    vwsub.vx v12, v14, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vx v10, v10, a0
-; RV32-NEXT:    vand.vx v12, v14, a0
+; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vsrl.vv v12, v8, v12
 ; RV32-NEXT:    vsll.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -1081,12 +1081,12 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vwsub.vx v20, v16, a0
+; RV32-NEXT:    vmv.v.i v20, 0
+; RV32-NEXT:    vwsub.vx v16, v20, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vx v12, v12, a0
-; RV32-NEXT:    vand.vx v16, v20, a0
+; RV32-NEXT:    vand.vx v16, v16, a0
 ; RV32-NEXT:    vsrl.vv v16, v8, v16
 ; RV32-NEXT:    vsll.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
index 68a9e217ccd1c..ec22d2be1eaad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
@@ -1833,12 +1833,12 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
+; CHECK-RV32-NEXT:    vmv.v.i v14, 0
+; CHECK-RV32-NEXT:    vwsub.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
@@ -1872,17 +1872,17 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_v4i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
+; CHECK-RV32-NEXT:    vmv.v.i v12, 0
 ; CHECK-RV32-NEXT:    li a0, 1
-; CHECK-RV32-NEXT:    vwsubu.vx v12, v10, a0
+; CHECK-RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v4i64:
@@ -1907,17 +1907,17 @@ define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_rotl_v4i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
+; CHECK-RV32-NEXT:    vmv.v.i v12, 0
 ; CHECK-RV32-NEXT:    li a0, 1
-; CHECK-RV32-NEXT:    vwsubu.vx v12, v10, a0
+; CHECK-RV32-NEXT:    vwsubu.vx v10, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64:
@@ -1969,12 +1969,12 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
+; CHECK-RV32-NEXT:    vmv.v.i v20, 0
+; CHECK-RV32-NEXT:    vwsub.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
@@ -2008,17 +2008,17 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_v8i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
+; CHECK-RV32-NEXT:    vmv.v.i v16, 0
 ; CHECK-RV32-NEXT:    li a0, 1
-; CHECK-RV32-NEXT:    vwsubu.vx v16, v12, a0
+; CHECK-RV32-NEXT:    vwsubu.vx v12, v16, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
+; CHECK-RV32-NEXT:    vmv.v.x v16, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
+; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v8i64:
@@ -2043,17 +2043,17 @@ define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_rotl_v8i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
+; CHECK-RV32-NEXT:    vmv.v.i v16, 0
 ; CHECK-RV32-NEXT:    li a0, 1
-; CHECK-RV32-NEXT:    vwsubu.vx v16, v12, a0
+; CHECK-RV32-NEXT:    vwsubu.vx v12, v16, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
+; CHECK-RV32-NEXT:    vmv.v.x v16, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
+; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll
index d241b78e41391..7bac239cfffea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll
@@ -5,12 +5,13 @@
 define <8 x i64> @vwadd_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vwadd_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
-; CHECK-NEXT:    vwadd.wv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
+; CHECK-NEXT:    vwadd.wv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -22,12 +23,13 @@ define <8 x i64> @vwadd_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 define <8 x i64> @vwaddu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vwaddu_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
-; CHECK-NEXT:    vwaddu.wv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
+; CHECK-NEXT:    vwaddu.wv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -39,13 +41,13 @@ define <8 x i64> @vwaddu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 define <8 x i64> @vwaddu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: vwaddu_vv_mask_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmerge.vvm v14, v10, v8, v0
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -58,12 +60,13 @@ define <8 x i64> @vwaddu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) {
 define <8 x i64> @vwadd_wv_mask_v8i32_commutative(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vwadd_wv_mask_v8i32_commutative:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
-; CHECK-NEXT:    vwadd.wv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
+; CHECK-NEXT:    vwadd.wv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index 98f246b8741dc..feafde0114a7d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -900,11 +900,12 @@ define <4 x i64> @crash(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: crash:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vsext.vf4 v8, v11
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    vwaddu.wv v10, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vzext.vf2 v11, v10
+; CHECK-NEXT:    vwaddu.wv v8, v8, v11
 ; CHECK-NEXT:    ret
   %a = sext <4 x i16> %x to <4 x i64>
   %b = zext <4 x i16> %y to <4 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
index 7b12a56b78661..2c9aed6274dd8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
@@ -20,8 +20,9 @@ define <4 x i64> @vwsll_vv_v4i64_sext(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <4 x i32> %a to <4 x i64>
   %y = sext <4 x i32> %b to <4 x i64>
@@ -41,8 +42,9 @@ define <4 x i64> @vwsll_vv_v4i64_zext(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <4 x i32> %a to <4 x i64>
   %y = zext <4 x i32> %b to <4 x i64>
@@ -61,8 +63,8 @@ define <4 x i64> @vwsll_vx_i64_v4i64(<4 x i32> %a, i64 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i64_v4i64:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
@@ -75,19 +77,19 @@ define <4 x i64> @vwsll_vx_i32_v4i64_sext(<4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: vwsll_vx_i32_v4i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vmv.v.x v11, a0
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v10, v11
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -101,18 +103,18 @@ define <4 x i64> @vwsll_vx_i32_v4i64_zext(<4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: vwsll_vx_i32_v4i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -126,18 +128,18 @@ define <4 x i64> @vwsll_vx_i16_v4i64_sext(<4 x i32> %a, i16 %b) {
 ; CHECK-LABEL: vwsll_vx_i16_v4i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -151,18 +153,18 @@ define <4 x i64> @vwsll_vx_i16_v4i64_zext(<4 x i32> %a, i16 %b) {
 ; CHECK-LABEL: vwsll_vx_i16_v4i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -176,18 +178,18 @@ define <4 x i64> @vwsll_vx_i8_v4i64_sext(<4 x i32> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v4i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -201,18 +203,18 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v4i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -225,17 +227,17 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) {
 define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) {
 ; CHECK-LABEL: vwsll_vi_v4i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_v4i64:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <4 x i32> %a to <4 x i64>
   %z = shl <4 x i64> %x, splat (i64 2)
@@ -258,8 +260,9 @@ define <8 x i32> @vwsll_vv_v8i32_sext(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <8 x i16> %a to <8 x i32>
   %y = sext <8 x i16> %b to <8 x i32>
@@ -279,8 +282,9 @@ define <8 x i32> @vwsll_vv_v8i32_zext(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <8 x i16> %a to <8 x i32>
   %y = zext <8 x i16> %b to <8 x i32>
@@ -299,8 +303,8 @@ define <8 x i32> @vwsll_vx_i64_v8i32(<8 x i16> %a, i64 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i64_v8i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <8 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <8 x i64> %head, <8 x i64> poison, <8 x i32> zeroinitializer
@@ -321,8 +325,8 @@ define <8 x i32> @vwsll_vx_i32_v8i32(<8 x i16> %a, i32 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v8i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <8 x i32> %head, <8 x i32> poison, <8 x i32> zeroinitializer
@@ -335,19 +339,19 @@ define <8 x i32> @vwsll_vx_i16_v8i32_sext(<8 x i16> %a, i16 %b) {
 ; CHECK-LABEL: vwsll_vx_i16_v8i32_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v8i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vmv.v.x v11, a0
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v10, v11
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <8 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -361,18 +365,18 @@ define <8 x i32> @vwsll_vx_i16_v8i32_zext(<8 x i16> %a, i16 %b) {
 ; CHECK-LABEL: vwsll_vx_i16_v8i32_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v8i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <8 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -386,18 +390,18 @@ define <8 x i32> @vwsll_vx_i8_v8i32_sext(<8 x i16> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v8i32_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <8 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -411,18 +415,18 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v8i32_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <8 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -435,17 +439,17 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) {
 define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) {
 ; CHECK-LABEL: vwsll_vi_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_v8i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <8 x i16> %a to <8 x i32>
   %z = shl <8 x i32> %x, splat (i32 2)
@@ -468,8 +472,9 @@ define <16 x i16> @vwsll_vv_v16i16_sext(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <16 x i8> %a to <16 x i16>
   %y = sext <16 x i8> %b to <16 x i16>
@@ -489,8 +494,9 @@ define <16 x i16> @vwsll_vv_v16i16_zext(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <16 x i8> %a to <16 x i16>
   %y = zext <16 x i8> %b to <16 x i16>
@@ -506,12 +512,12 @@ define <16 x i16> @vwsll_vx_i64_v16i16(<16 x i8> %a, i64 %b) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgather.vi v24, v16, 0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vzext.vf2 v10, v8
+; RV32-NEXT:    vzext.vf2 v12, v8
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v12, v24, 0
+; RV32-NEXT:    vnsrl.wi v8, v24, 0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vnsrl.wi v8, v12, 0
-; RV32-NEXT:    vsll.vv v8, v10, v8
+; RV32-NEXT:    vnsrl.wi v14, v8, 0
+; RV32-NEXT:    vsll.vv v8, v12, v14
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwsll_vx_i64_v16i16:
@@ -519,12 +525,12 @@ define <16 x i16> @vwsll_vx_i64_v16i16(<16 x i8> %a, i64 %b) {
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vmv.v.x v16, a0
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vzext.vf2 v10, v8
+; RV64-NEXT:    vzext.vf2 v12, v8
 ; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64-NEXT:    vnsrl.wi v12, v16, 0
+; RV64-NEXT:    vnsrl.wi v8, v16, 0
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vnsrl.wi v8, v12, 0
-; RV64-NEXT:    vsll.vv v8, v10, v8
+; RV64-NEXT:    vnsrl.wi v14, v8, 0
+; RV64-NEXT:    vsll.vv v8, v12, v14
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-RV32-LABEL: vwsll_vx_i64_v16i16:
@@ -534,19 +540,19 @@ define <16 x i16> @vwsll_vx_i64_v16i16(<16 x i8> %a, i64 %b) {
 ; CHECK-ZVBB-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-ZVBB-RV32-NEXT:    vrgather.vi v24, v16, 0
 ; CHECK-ZVBB-RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-RV32-NEXT:    vzext.vf2 v10, v8
+; CHECK-ZVBB-RV32-NEXT:    vzext.vf2 v12, v8
 ; CHECK-ZVBB-RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-ZVBB-RV32-NEXT:    vnsrl.wi v12, v24, 0
+; CHECK-ZVBB-RV32-NEXT:    vnsrl.wi v8, v24, 0
 ; CHECK-ZVBB-RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-RV32-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-ZVBB-RV32-NEXT:    vsll.vv v8, v10, v8
+; CHECK-ZVBB-RV32-NEXT:    vnsrl.wi v14, v8, 0
+; CHECK-ZVBB-RV32-NEXT:    vsll.vv v8, v12, v14
 ; CHECK-ZVBB-RV32-NEXT:    ret
 ;
 ; CHECK-ZVBB-RV64-LABEL: vwsll_vx_i64_v16i16:
 ; CHECK-ZVBB-RV64:       # %bb.0:
 ; CHECK-ZVBB-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-RV64-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-RV64-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-RV64-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-RV64-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-RV64-NEXT:    ret
   %head = insertelement <8 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <8 x i64> %head, <8 x i64> poison, <16 x i32> zeroinitializer
@@ -570,8 +576,8 @@ define <16 x i16> @vwsll_vx_i32_v16i16(<16 x i8> %a, i32 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v16i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <16 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
@@ -592,8 +598,8 @@ define <16 x i16> @vwsll_vx_i16_v16i16(<16 x i8> %a, i16 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v16i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <16 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <16 x i16> %head, <16 x i16> poison, <16 x i32> zeroinitializer
@@ -606,19 +612,19 @@ define <16 x i16> @vwsll_vx_i8_v16i16_sext(<16 x i8> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v16i16_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v16i16_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vmv.v.x v11, a0
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v10, v11
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <16 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -632,18 +638,18 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v16i16_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v16i16_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <16 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -656,17 +662,17 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) {
 define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) {
 ; CHECK-LABEL: vwsll_vi_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_v16i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <16 x i8> %a to <16 x i16>
   %z = shl <16 x i16> %x, splat (i16 2)
@@ -746,21 +752,21 @@ define <4 x i64> @vwsll_vx_i32_v4i64_v4i8_sext(<4 x i8> %a, i32 %b) {
 ; CHECK-LABEL: vwsll_vx_i32_v4i64_v4i8_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_v4i8_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT:    vmv.v.x v12, a0
 ; CHECK-ZVBB-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vzext.vf8 v10, v8
-; CHECK-ZVBB-NEXT:    vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT:    vsext.vf2 v8, v12
+; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -774,11 +780,11 @@ define <4 x i64> @vwsll_vx_i32_v4i64_v4i8_zext(<4 x i8> %a, i32 %b) {
 ; CHECK-LABEL: vwsll_vx_i32_v4i64_v4i8_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_v4i8_zext:
@@ -799,21 +805,21 @@ define <4 x i64> @vwsll_vx_i16_v4i64_v4i8_sext(<4 x i8> %a, i16 %b) {
 ; CHECK-LABEL: vwsll_vx_i16_v4i64_v4i8_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_v4i8_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT:    vmv.v.x v12, a0
 ; CHECK-ZVBB-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vzext.vf8 v10, v8
-; CHECK-ZVBB-NEXT:    vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT:    vsext.vf4 v8, v12
+; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -827,11 +833,11 @@ define <4 x i64> @vwsll_vx_i16_v4i64_v4i8_zext(<4 x i8> %a, i16 %b) {
 ; CHECK-LABEL: vwsll_vx_i16_v4i64_v4i8_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_v4i8_zext:
@@ -852,21 +858,21 @@ define <4 x i64> @vwsll_vx_i8_v4i64_v4i8_sext(<4 x i8> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v4i64_v4i8_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vsext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_v4i8_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT:    vmv.v.x v12, a0
 ; CHECK-ZVBB-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vzext.vf8 v10, v8
-; CHECK-ZVBB-NEXT:    vsext.vf8 v12, v9
-; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT:    vsext.vf8 v8, v12
+; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -880,11 +886,11 @@ define <4 x i64> @vwsll_vx_i8_v4i64_v4i8_zext(<4 x i8> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_v4i64_v4i8_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vzext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_v4i8_zext:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll
index 382f00913cb41..eafea7292a54b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll
@@ -5,12 +5,13 @@
 define <8 x i64> @vwsub_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vwsub_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
-; CHECK-NEXT:    vwsub.wv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
+; CHECK-NEXT:    vwsub.wv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -22,12 +23,13 @@ define <8 x i64> @vwsub_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 define <8 x i64> @vwsubu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vwsubu_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
-; CHECK-NEXT:    vwsubu.wv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, tu, mu
+; CHECK-NEXT:    vwsubu.wv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -39,13 +41,13 @@ define <8 x i64> @vwsubu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) {
 define <8 x i64> @vwsubu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: vwsubu_vv_mask_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vwsubu.vv v12, v10, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmerge.vvm v14, v10, v8, v0
+; CHECK-NEXT:    vwsubu.vv v8, v12, v14
 ; CHECK-NEXT:    ret
     %mask = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
index df90dae379c06..8259336e8668c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
@@ -56,8 +56,8 @@ define <4 x i64> @vzext_v4i64_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl)
 ; CHECK-LABEL: vzext_v4i64_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf8 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -67,8 +67,8 @@ define <4 x i64> @vzext_v4i64_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vzext_v4i64_v4i8_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf8 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -104,8 +104,8 @@ define <4 x i64> @vzext_v4i64_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl
 ; CHECK-LABEL: vzext_v4i64_v4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -115,8 +115,8 @@ define <4 x i64> @vzext_v4i64_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vzext_v4i64_v4i16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -128,8 +128,8 @@ define <4 x i64> @vzext_v4i64_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl
 ; CHECK-LABEL: vzext_v4i64_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -139,8 +139,8 @@ define <4 x i64> @vzext_v4i64_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vzext_v4i64_v4i32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -153,24 +153,25 @@ define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB12_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v24, v8, v0.t
+; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vzext.vf2 v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> %va, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
index b7661bd826fed..4512d809995a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
@@ -358,15 +358,15 @@ define <vscale x 4 x i64> @trunc_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: trunc_nxv4f32_to_si64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: trunc_nxv4f32_to_si64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    vmv2r.v v12, v8
+; RV64-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; RV64-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> %x)
   %b = fptosi <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -377,15 +377,15 @@ define <vscale x 4 x i64> @trunc_nxv4f32_to_ui64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: trunc_nxv4f32_to_ui64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: trunc_nxv4f32_to_ui64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    vmv2r.v v12, v8
+; RV64-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; RV64-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> %x)
   %b = fptoui <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -801,20 +801,20 @@ define <vscale x 4 x i32> @ceil_nxv4f32_to_ui32(<vscale x 4 x float> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_si64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv2r.v v12, v8
 ; RV32-NEXT:    fsrmi a0, 3
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.x.f.v v12, v8
+; RV32-NEXT:    vfwcvt.x.f.v v8, v12
 ; RV32-NEXT:    fsrm a0
-; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_si64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT:    vmv2r.v v12, v8
 ; RV64-NEXT:    fsrmi a0, 3
-; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.x.f.v v12, v8
+; RV64-NEXT:    vfwcvt.x.f.v v8, v12
 ; RV64-NEXT:    fsrm a0
-; RV64-NEXT:    vmv4r.v v8, v12
 ; RV64-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> %x)
   %b = fptosi <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -824,20 +824,20 @@ define <vscale x 4 x i64> @ceil_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f32_to_ui64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_ui64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv2r.v v12, v8
 ; RV32-NEXT:    fsrmi a0, 3
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.xu.f.v v12, v8
+; RV32-NEXT:    vfwcvt.xu.f.v v8, v12
 ; RV32-NEXT:    fsrm a0
-; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_ui64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT:    vmv2r.v v12, v8
 ; RV64-NEXT:    fsrmi a0, 3
-; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.xu.f.v v12, v8
+; RV64-NEXT:    vfwcvt.xu.f.v v8, v12
 ; RV64-NEXT:    fsrm a0
-; RV64-NEXT:    vmv4r.v v8, v12
 ; RV64-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> %x)
   %b = fptoui <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -1004,15 +1004,15 @@ define <vscale x 4 x i64> @rint_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: rint_nxv4f32_to_si64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.x.f.v v12, v8
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vfwcvt.x.f.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: rint_nxv4f32_to_si64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.x.f.v v12, v8
-; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    vmv2r.v v12, v8
+; RV64-NEXT:    vfwcvt.x.f.v v8, v12
 ; RV64-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
   %b = fptosi <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -1023,15 +1023,15 @@ define <vscale x 4 x i64> @rint_nxv4f32_to_ui64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: rint_nxv4f32_to_ui64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT:    vfwcvt.xu.f.v v12, v8
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vfwcvt.xu.f.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: rint_nxv4f32_to_ui64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT:    vfwcvt.xu.f.v v12, v8
-; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    vmv2r.v v12, v8
+; RV64-NEXT:    vfwcvt.xu.f.v v8, v12
 ; RV64-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
   %b = fptoui <vscale x 4 x float> %a to <vscale x 4 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 21fa551ad75df..6e214578d47bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -406,12 +406,12 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v24, a2
+; CHECK-NEXT:    vslidedown.vx v7, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
@@ -1050,12 +1050,12 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v7, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index b2269848897a2..d44c54dd965e1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -406,12 +406,12 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v24, a2
+; CHECK-NEXT:    vslidedown.vx v7, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
@@ -1050,12 +1050,12 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v7, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll
index 52e5ecf9cb8a1..bc45671077106 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll
@@ -284,11 +284,11 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfwcvt.f.f.v v9, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v9
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f16.nxv2i64(<vscale x 2 x half> %f)
     ret <vscale x 2 x i64> %x
@@ -299,11 +299,11 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v10
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
     ret <vscale x 4 x i64> %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
index 02b43c2d95295..bb5ad6ba9d88a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
@@ -284,11 +284,11 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfwcvt.f.f.v v9, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v9
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2f16.nxv2i64(<vscale x 2 x half> %f)
     ret <vscale x 2 x i64> %x
@@ -299,11 +299,11 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v10
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
     ret <vscale x 4 x i64> %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
index 4d47c265a9747..ae0542fb5b74f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
@@ -178,8 +178,8 @@ define <vscale x 4 x i32> @trunc_nxv4f16_to_si32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: trunc_nxv4f16_to_si32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half> %x)
   %b = fptosi <vscale x 4 x half> %a to <vscale x 4 x i32>
@@ -190,8 +190,8 @@ define <vscale x 4 x i32> @trunc_nxv4f16_to_ui32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: trunc_nxv4f16_to_ui32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half> %x)
   %b = fptoui <vscale x 4 x half> %a to <vscale x 4 x i32>
@@ -505,11 +505,11 @@ define <vscale x 4 x i16> @ceil_nxv4f16_to_ui16(<vscale x 4 x half> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f16_to_si32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_si32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v10
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half> %x)
   %b = fptosi <vscale x 4 x half> %a to <vscale x 4 x i32>
@@ -519,11 +519,11 @@ define <vscale x 4 x i32> @ceil_nxv4f16_to_si32(<vscale x 4 x half> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f16_to_ui32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_ui32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v10
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half> %x)
   %b = fptoui <vscale x 4 x half> %a to <vscale x 4 x i32>
@@ -886,8 +886,8 @@ define <vscale x 4 x i32> @rint_nxv4f16_to_si32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: rint_nxv4f16_to_si32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
   %b = fptosi <vscale x 4 x half> %a to <vscale x 4 x i32>
@@ -898,8 +898,8 @@ define <vscale x 4 x i32> @rint_nxv4f16_to_ui32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: rint_nxv4f16_to_ui32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
   %b = fptoui <vscale x 4 x half> %a to <vscale x 4 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
index 7e2ec46339b33..e7b85c5d6902b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
@@ -7,28 +7,28 @@ define void @interleave256(ptr %agg.result, ptr %0, ptr %1) {
 ; RV64-1024:       # %bb.0: # %entry
 ; RV64-1024-NEXT:    li a3, 128
 ; RV64-1024-NEXT:    vsetvli zero, a3, e16, m2, ta, ma
-; RV64-1024-NEXT:    vle16.v v8, (a1)
-; RV64-1024-NEXT:    vle16.v v10, (a2)
+; RV64-1024-NEXT:    vle16.v v12, (a1)
+; RV64-1024-NEXT:    vle16.v v14, (a2)
 ; RV64-1024-NEXT:    li a1, -1
-; RV64-1024-NEXT:    vwaddu.vv v12, v8, v10
-; RV64-1024-NEXT:    vwmaccu.vx v12, a1, v10
+; RV64-1024-NEXT:    vwaddu.vv v8, v12, v14
+; RV64-1024-NEXT:    vwmaccu.vx v8, a1, v14
 ; RV64-1024-NEXT:    li a1, 256
 ; RV64-1024-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; RV64-1024-NEXT:    vse16.v v12, (a0)
+; RV64-1024-NEXT:    vse16.v v8, (a0)
 ; RV64-1024-NEXT:    ret
 ;
 ; RV64-2048-LABEL: interleave256:
 ; RV64-2048:       # %bb.0: # %entry
 ; RV64-2048-NEXT:    li a3, 128
 ; RV64-2048-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; RV64-2048-NEXT:    vle16.v v8, (a1)
-; RV64-2048-NEXT:    vle16.v v9, (a2)
+; RV64-2048-NEXT:    vle16.v v10, (a1)
+; RV64-2048-NEXT:    vle16.v v11, (a2)
 ; RV64-2048-NEXT:    li a1, -1
-; RV64-2048-NEXT:    vwaddu.vv v10, v8, v9
-; RV64-2048-NEXT:    vwmaccu.vx v10, a1, v9
+; RV64-2048-NEXT:    vwaddu.vv v8, v10, v11
+; RV64-2048-NEXT:    vwmaccu.vx v8, a1, v11
 ; RV64-2048-NEXT:    li a1, 256
 ; RV64-2048-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; RV64-2048-NEXT:    vse16.v v10, (a0)
+; RV64-2048-NEXT:    vse16.v v8, (a0)
 ; RV64-2048-NEXT:    ret
 entry:
   %ve = load <128 x i16>, ptr %0, align 256
@@ -45,28 +45,28 @@ define void @interleave512(ptr %agg.result, ptr %0, ptr %1) local_unnamed_addr {
 ; RV64-1024:       # %bb.0: # %entry
 ; RV64-1024-NEXT:    li a3, 256
 ; RV64-1024-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; RV64-1024-NEXT:    vle16.v v8, (a1)
-; RV64-1024-NEXT:    vle16.v v12, (a2)
+; RV64-1024-NEXT:    vle16.v v16, (a1)
+; RV64-1024-NEXT:    vle16.v v20, (a2)
 ; RV64-1024-NEXT:    li a1, -1
-; RV64-1024-NEXT:    vwaddu.vv v16, v8, v12
-; RV64-1024-NEXT:    vwmaccu.vx v16, a1, v12
+; RV64-1024-NEXT:    vwaddu.vv v8, v16, v20
+; RV64-1024-NEXT:    vwmaccu.vx v8, a1, v20
 ; RV64-1024-NEXT:    li a1, 512
 ; RV64-1024-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV64-1024-NEXT:    vse16.v v16, (a0)
+; RV64-1024-NEXT:    vse16.v v8, (a0)
 ; RV64-1024-NEXT:    ret
 ;
 ; RV64-2048-LABEL: interleave512:
 ; RV64-2048:       # %bb.0: # %entry
 ; RV64-2048-NEXT:    li a3, 256
 ; RV64-2048-NEXT:    vsetvli zero, a3, e16, m2, ta, ma
-; RV64-2048-NEXT:    vle16.v v8, (a1)
-; RV64-2048-NEXT:    vle16.v v10, (a2)
+; RV64-2048-NEXT:    vle16.v v12, (a1)
+; RV64-2048-NEXT:    vle16.v v14, (a2)
 ; RV64-2048-NEXT:    li a1, -1
-; RV64-2048-NEXT:    vwaddu.vv v12, v8, v10
-; RV64-2048-NEXT:    vwmaccu.vx v12, a1, v10
+; RV64-2048-NEXT:    vwaddu.vv v8, v12, v14
+; RV64-2048-NEXT:    vwmaccu.vx v8, a1, v14
 ; RV64-2048-NEXT:    li a1, 512
 ; RV64-2048-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; RV64-2048-NEXT:    vse16.v v12, (a0)
+; RV64-2048-NEXT:    vse16.v v8, (a0)
 ; RV64-2048-NEXT:    ret
 entry:
   %ve = load <256 x i16>, ptr %0, align 512
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
index 7d37d91ee21b5..c5e1852923d54 100644
--- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -37,12 +37,12 @@ define <vscale x 16 x i1> @match_nxv16i8_v4i8(<vscale x 16 x i8> %op1, <4 x i8>
 ; CHECK-NEXT:    vmseq.vv v14, v8, v12
 ; CHECK-NEXT:    vrgather.vi v12, v10, 0
 ; CHECK-NEXT:    vmseq.vv v15, v8, v12
-; CHECK-NEXT:    vmor.mm v12, v15, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 2
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 3
-; CHECK-NEXT:    vmor.mm v10, v12, v13
-; CHECK-NEXT:    vmseq.vv v11, v8, v14
+; CHECK-NEXT:    vmor.mm v14, v15, v14
+; CHECK-NEXT:    vrgather.vi v12, v10, 2
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vrgather.vi v12, v10, 3
+; CHECK-NEXT:    vmor.mm v10, v14, v15
+; CHECK-NEXT:    vmseq.vv v11, v8, v12
 ; CHECK-NEXT:    vmor.mm v8, v10, v11
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
@@ -58,24 +58,24 @@ define <vscale x 16 x i1> @match_nxv16i8_v8i8(<vscale x 16 x i8> %op1, <8 x i8>
 ; CHECK-NEXT:    vmseq.vv v14, v8, v12
 ; CHECK-NEXT:    vrgather.vi v12, v10, 0
 ; CHECK-NEXT:    vmseq.vv v15, v8, v12
-; CHECK-NEXT:    vmor.mm v12, v15, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 2
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 3
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 4
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 5
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 6
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 7
-; CHECK-NEXT:    vmor.mm v10, v12, v13
-; CHECK-NEXT:    vmseq.vv v11, v8, v14
+; CHECK-NEXT:    vmor.mm v14, v15, v14
+; CHECK-NEXT:    vrgather.vi v12, v10, 2
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 3
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 4
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 5
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 6
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vrgather.vi v12, v10, 7
+; CHECK-NEXT:    vmor.mm v10, v14, v15
+; CHECK-NEXT:    vmseq.vv v11, v8, v12
 ; CHECK-NEXT:    vmor.mm v8, v10, v11
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
@@ -91,48 +91,48 @@ define <vscale x 16 x i1> @match_nxv16i8_v16i8(<vscale x 16 x i8> %op1, <16 x i8
 ; CHECK-NEXT:    vmseq.vv v14, v8, v12
 ; CHECK-NEXT:    vrgather.vi v12, v10, 0
 ; CHECK-NEXT:    vmseq.vv v15, v8, v12
-; CHECK-NEXT:    vmor.mm v12, v15, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 2
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 3
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 4
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 5
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 6
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 7
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 8
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 9
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 10
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 11
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 12
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 13
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 14
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 15
-; CHECK-NEXT:    vmor.mm v10, v12, v13
-; CHECK-NEXT:    vmseq.vv v11, v8, v14
+; CHECK-NEXT:    vmor.mm v14, v15, v14
+; CHECK-NEXT:    vrgather.vi v12, v10, 2
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 3
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 4
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 5
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 6
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 7
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 8
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 9
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 10
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 11
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 12
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 13
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 14
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vrgather.vi v12, v10, 15
+; CHECK-NEXT:    vmor.mm v10, v14, v15
+; CHECK-NEXT:    vmseq.vv v11, v8, v12
 ; CHECK-NEXT:    vmor.mm v8, v10, v11
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
@@ -319,24 +319,24 @@ define <vscale x 8 x i1> @match_nxv8i16_v8i16(<vscale x 8 x i16> %op1, <8 x i16>
 ; CHECK-NEXT:    vmseq.vv v14, v8, v12
 ; CHECK-NEXT:    vrgather.vi v12, v10, 0
 ; CHECK-NEXT:    vmseq.vv v15, v8, v12
-; CHECK-NEXT:    vmor.mm v12, v15, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 2
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 3
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 4
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 5
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vmor.mm v12, v12, v13
-; CHECK-NEXT:    vrgather.vi v14, v10, 6
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 7
-; CHECK-NEXT:    vmor.mm v10, v12, v13
-; CHECK-NEXT:    vmseq.vv v11, v8, v14
+; CHECK-NEXT:    vmor.mm v14, v15, v14
+; CHECK-NEXT:    vrgather.vi v12, v10, 2
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 3
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 4
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 5
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vmor.mm v14, v14, v15
+; CHECK-NEXT:    vrgather.vi v12, v10, 6
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vrgather.vi v12, v10, 7
+; CHECK-NEXT:    vmor.mm v10, v14, v15
+; CHECK-NEXT:    vmseq.vv v11, v8, v12
 ; CHECK-NEXT:    vmor.mm v8, v10, v11
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
@@ -966,12 +966,12 @@ define <vscale x 4 x i1> @match_nxv4xi32_v4i32(<vscale x 4 x i32> %op1, <4 x i32
 ; CHECK-NEXT:    vmseq.vv v14, v8, v12
 ; CHECK-NEXT:    vrgather.vi v12, v10, 0
 ; CHECK-NEXT:    vmseq.vv v15, v8, v12
-; CHECK-NEXT:    vmor.mm v12, v15, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 2
-; CHECK-NEXT:    vmseq.vv v13, v8, v14
-; CHECK-NEXT:    vrgather.vi v14, v10, 3
-; CHECK-NEXT:    vmor.mm v10, v12, v13
-; CHECK-NEXT:    vmseq.vv v11, v8, v14
+; CHECK-NEXT:    vmor.mm v14, v15, v14
+; CHECK-NEXT:    vrgather.vi v12, v10, 2
+; CHECK-NEXT:    vmseq.vv v15, v8, v12
+; CHECK-NEXT:    vrgather.vi v12, v10, 3
+; CHECK-NEXT:    vmor.mm v10, v14, v15
+; CHECK-NEXT:    vmseq.vv v11, v8, v12
 ; CHECK-NEXT:    vmor.mm v8, v10, v11
 ; CHECK-NEXT:    vmand.mm v0, v8, v0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-sdnode.ll
index 818abb9f4a009..66c64c6b275f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-sdnode.ll
@@ -20,8 +20,8 @@ define <vscale x 2 x i64> @llrint_nxv2i64_nxv2f32(<vscale x 2 x float> %x) {
 ; CHECK-LABEL: llrint_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f32(<vscale x 2 x float> %x)
   ret <vscale x 2 x i64> %a
@@ -32,8 +32,8 @@ define <vscale x 4 x i64> @llrint_nxv4i64_nxv4f32(<vscale x 4 x float> %x) {
 ; CHECK-LABEL: llrint_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v12
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f32(<vscale x 4 x float> %x)
   ret <vscale x 4 x i64> %a
@@ -44,8 +44,8 @@ define <vscale x 8 x i64> @llrint_nxv8i64_nxv8f32(<vscale x 8 x float> %x) {
 ; CHECK-LABEL: llrint_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v16
 ; CHECK-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f32(<vscale x 8 x float> %x)
   ret <vscale x 8 x i64> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index 166dba6a56524..dbe2d03e1a909 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -20,8 +20,8 @@ define <vscale x 2 x i64> @llrint_nxv2i64_nxv2f32(<vscale x 2 x float> %x, <vsca
 ; CHECK-LABEL: llrint_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.vp.llrint.nxv2i64.nxv2f32(<vscale x 2 x float> %x, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %a
@@ -32,8 +32,8 @@ define <vscale x 4 x i64> @llrint_nxv4i64_nxv4f32(<vscale x 4 x float> %x, <vsca
 ; CHECK-LABEL: llrint_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.vp.llrint.nxv4i64.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %a
@@ -44,8 +44,8 @@ define <vscale x 8 x i64> @llrint_nxv8i64_nxv8f32(<vscale x 8 x float> %x, <vsca
 ; CHECK-LABEL: llrint_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.vp.llrint.nxv8i64.nxv8f32(<vscale x 8 x float> %x, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-sdnode.ll
index a9668dff6055a..534fdf21675ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-sdnode.ll
@@ -46,8 +46,8 @@ define <vscale x 2 x iXLen> @lrint_nxv2f32(<vscale x 2 x float> %x) {
 ; RV64-i64-LABEL: lrint_nxv2f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v10, v8
-; RV64-i64-NEXT:    vmv2r.v v8, v10
+; RV64-i64-NEXT:    vmv1r.v v10, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v10
 ; RV64-i64-NEXT:    ret
   %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f32(<vscale x 2 x float> %x)
   ret <vscale x 2 x iXLen> %a
@@ -70,8 +70,8 @@ define <vscale x 4 x iXLen> @lrint_nxv4f32(<vscale x 4 x float> %x) {
 ; RV64-i64-LABEL: lrint_nxv4f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v12, v8
-; RV64-i64-NEXT:    vmv4r.v v8, v12
+; RV64-i64-NEXT:    vmv2r.v v12, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v12
 ; RV64-i64-NEXT:    ret
   %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f32(<vscale x 4 x float> %x)
   ret <vscale x 4 x iXLen> %a
@@ -94,8 +94,8 @@ define <vscale x 8 x iXLen> @lrint_nxv8f32(<vscale x 8 x float> %x) {
 ; RV64-i64-LABEL: lrint_nxv8f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v16, v8
-; RV64-i64-NEXT:    vmv8r.v v8, v16
+; RV64-i64-NEXT:    vmv4r.v v16, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v16
 ; RV64-i64-NEXT:    ret
   %a = call <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f32(<vscale x 8 x float> %x)
   ret <vscale x 8 x iXLen> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index 21045b69a8b5d..98d32b36c23c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -46,8 +46,8 @@ define <vscale x 2 x iXLen> @lrint_nxv2f32(<vscale x 2 x float> %x, <vscale x 2
 ; RV64-i64-LABEL: lrint_nxv2f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v10, v8, v0.t
-; RV64-i64-NEXT:    vmv2r.v v8, v10
+; RV64-i64-NEXT:    vmv1r.v v10, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <vscale x 2 x iXLen> @llvm.vp.lrint.nxv2iXLen.nxv2f32(<vscale x 2 x float> %x, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x iXLen> %a
@@ -70,8 +70,8 @@ define <vscale x 4 x iXLen> @lrint_nxv4f32(<vscale x 4 x float> %x, <vscale x 4
 ; RV64-i64-LABEL: lrint_nxv4f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v12, v8, v0.t
-; RV64-i64-NEXT:    vmv4r.v v8, v12
+; RV64-i64-NEXT:    vmv2r.v v12, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <vscale x 4 x iXLen> @llvm.vp.lrint.nxv4iXLen.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x iXLen> %a
@@ -94,8 +94,8 @@ define <vscale x 8 x iXLen> @lrint_nxv8f32(<vscale x 8 x float> %x, <vscale x 8
 ; RV64-i64-LABEL: lrint_nxv8f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; RV64-i64-NEXT:    vfwcvt.x.f.v v16, v8, v0.t
-; RV64-i64-NEXT:    vmv8r.v v8, v16
+; RV64-i64-NEXT:    vmv4r.v v16, v8
+; RV64-i64-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
 ; RV64-i64-NEXT:    ret
   %a = call <vscale x 8 x iXLen> @llvm.vp.lrint.nxv8iXLen.nxv8f32(<vscale x 8 x float> %x, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x iXLen> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 72ea0be5da28d..2757e140ecde5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -140,10 +140,10 @@ define <vscale x 2 x i64> @mgather_nxv2i8_sextload_nxv2i64(<vscale x 2 x ptr> %p
 ; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
-; RV32-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v10, v9
+; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsext.vf8 v10, v9
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vsext.vf8 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i64:
@@ -162,10 +162,10 @@ define <vscale x 2 x i64> @mgather_nxv2i8_zextload_nxv2i64(<vscale x 2 x ptr> %p
 ; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
-; RV32-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v10, v9
+; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vzext.vf8 v10, v9
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vzext.vf8 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i64:
@@ -363,10 +363,10 @@ define <vscale x 2 x i64> @mgather_nxv2i16_sextload_nxv2i64(<vscale x 2 x ptr> %
 ; RV32-LABEL: mgather_nxv2i16_sextload_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
-; RV32-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v10, v9
+; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v10, v9
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vsext.vf4 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_nxv2i16_sextload_nxv2i64:
@@ -385,10 +385,10 @@ define <vscale x 2 x i64> @mgather_nxv2i16_zextload_nxv2i64(<vscale x 2 x ptr> %
 ; RV32-LABEL: mgather_nxv2i16_zextload_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
-; RV32-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v10, v9
+; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vzext.vf4 v10, v9
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vzext.vf4 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_nxv2i16_zextload_nxv2i64:
@@ -610,10 +610,10 @@ define <vscale x 2 x i64> @mgather_nxv2i32_sextload_nxv2i64(<vscale x 2 x ptr> %
 ; RV32-LABEL: mgather_nxv2i32_sextload_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; RV32-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v10, v9
+; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsext.vf2 v10, v9
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vsext.vf2 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_nxv2i32_sextload_nxv2i64:
@@ -632,10 +632,10 @@ define <vscale x 2 x i64> @mgather_nxv2i32_zextload_nxv2i64(<vscale x 2 x ptr> %
 ; RV32-LABEL: mgather_nxv2i32_zextload_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; RV32-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v10, v9
+; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vzext.vf2 v10, v9
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vzext.vf2 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_nxv2i32_zextload_nxv2i64:
@@ -941,8 +941,8 @@ define <vscale x 4 x i64> @mgather_truemask_nxv4i64(<vscale x 4 x ptr> %ptrs, <v
 ; RV32-LABEL: mgather_truemask_nxv4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_truemask_nxv4i64:
@@ -1999,8 +1999,8 @@ define <vscale x 4 x double> @mgather_truemask_nxv4f64(<vscale x 4 x ptr> %ptrs,
 ; RV32-LABEL: mgather_truemask_nxv4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_truemask_nxv4f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 9298e8b520bd9..d995a31f243d3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -206,14 +206,14 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v10, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vid.v v10
-; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v10, v10, a0
+; RV32-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v8, v8, a0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v8, v10
-; RV32-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v9, 0
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v10, v8
+; RV32-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v11, 0
 ; RV32-BITS-UNKNOWN-NEXT:    ret
 ;
 ; RV32-BITS-256-LABEL: reverse_nxv8i1:
@@ -247,14 +247,14 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v10, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vid.v v10
-; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v10, v10, a0
+; RV64-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v8, v8, a0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v8, v10
-; RV64-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v9, 0
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v10, v8
+; RV64-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v11, 0
 ; RV64-BITS-UNKNOWN-NEXT:    ret
 ;
 ; RV64-BITS-256-LABEL: reverse_nxv8i1:
@@ -310,38 +310,38 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vid.v v8
+; RV32-BITS-256-NEXT:    vid.v v10
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; RV32-BITS-256-NEXT:    vmv.v.i v10, 0
+; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vrsub.vx v8, v8, a0
+; RV32-BITS-256-NEXT:    vrsub.vx v12, v10, a0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-256-NEXT:    vmerge.vim v10, v10, 1, v0
+; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vrgather.vv v13, v10, v8
-; RV32-BITS-256-NEXT:    vrgather.vv v12, v11, v8
+; RV32-BITS-256-NEXT:    vrgather.vv v11, v8, v12
+; RV32-BITS-256-NEXT:    vrgather.vv v10, v9, v12
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-256-NEXT:    vmsne.vi v0, v12, 0
+; RV32-BITS-256-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-BITS-256-NEXT:    ret
 ;
 ; RV32-BITS-512-LABEL: reverse_nxv16i1:
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vid.v v8
+; RV32-BITS-512-NEXT:    vid.v v10
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; RV32-BITS-512-NEXT:    vmv.v.i v10, 0
+; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vrsub.vx v8, v8, a0
+; RV32-BITS-512-NEXT:    vrsub.vx v12, v10, a0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-512-NEXT:    vmerge.vim v10, v10, 1, v0
+; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vrgather.vv v13, v10, v8
-; RV32-BITS-512-NEXT:    vrgather.vv v12, v11, v8
+; RV32-BITS-512-NEXT:    vrgather.vv v11, v8, v12
+; RV32-BITS-512-NEXT:    vrgather.vv v10, v9, v12
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-512-NEXT:    vmsne.vi v0, v12, 0
+; RV32-BITS-512-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-BITS-512-NEXT:    ret
 ;
 ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i1:
@@ -367,38 +367,38 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vid.v v8
+; RV64-BITS-256-NEXT:    vid.v v10
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; RV64-BITS-256-NEXT:    vmv.v.i v10, 0
+; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vrsub.vx v8, v8, a0
+; RV64-BITS-256-NEXT:    vrsub.vx v12, v10, a0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-256-NEXT:    vmerge.vim v10, v10, 1, v0
+; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vrgather.vv v13, v10, v8
-; RV64-BITS-256-NEXT:    vrgather.vv v12, v11, v8
+; RV64-BITS-256-NEXT:    vrgather.vv v11, v8, v12
+; RV64-BITS-256-NEXT:    vrgather.vv v10, v9, v12
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-256-NEXT:    vmsne.vi v0, v12, 0
+; RV64-BITS-256-NEXT:    vmsne.vi v0, v10, 0
 ; RV64-BITS-256-NEXT:    ret
 ;
 ; RV64-BITS-512-LABEL: reverse_nxv16i1:
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vid.v v8
+; RV64-BITS-512-NEXT:    vid.v v10
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; RV64-BITS-512-NEXT:    vmv.v.i v10, 0
+; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vrsub.vx v8, v8, a0
+; RV64-BITS-512-NEXT:    vrsub.vx v12, v10, a0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-512-NEXT:    vmerge.vim v10, v10, 1, v0
+; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vrgather.vv v13, v10, v8
-; RV64-BITS-512-NEXT:    vrgather.vv v12, v11, v8
+; RV64-BITS-512-NEXT:    vrgather.vv v11, v8, v12
+; RV64-BITS-512-NEXT:    vrgather.vv v10, v9, v12
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-512-NEXT:    vmsne.vi v0, v12, 0
+; RV64-BITS-512-NEXT:    vmsne.vi v0, v10, 0
 ; RV64-BITS-512-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
   ret <vscale x 16 x i1> %res
@@ -409,14 +409,14 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV32-BITS-UNKNOWN-NEXT:    vid.v v12
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v12, 0
+; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
+; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v12, a0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v12, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v12, v16
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v13, v16
@@ -430,14 +430,14 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vid.v v8
+; RV32-BITS-256-NEXT:    vid.v v12
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; RV32-BITS-256-NEXT:    vmv.v.i v12, 0
+; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vrsub.vx v16, v8, a0
+; RV32-BITS-256-NEXT:    vrsub.vx v16, v12, a0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-256-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV32-BITS-256-NEXT:    vmerge.vim v12, v8, 1, v0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vrgather.vv v11, v12, v16
 ; RV32-BITS-256-NEXT:    vrgather.vv v10, v13, v16
@@ -451,14 +451,14 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vid.v v8
+; RV32-BITS-512-NEXT:    vid.v v12
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; RV32-BITS-512-NEXT:    vmv.v.i v12, 0
+; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vrsub.vx v16, v8, a0
+; RV32-BITS-512-NEXT:    vrsub.vx v16, v12, a0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-512-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV32-BITS-512-NEXT:    vmerge.vim v12, v8, 1, v0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vrgather.vv v11, v12, v16
 ; RV32-BITS-512-NEXT:    vrgather.vv v10, v13, v16
@@ -472,14 +472,14 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV64-BITS-UNKNOWN-NEXT:    vid.v v12
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v12, 0
+; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
+; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v12, a0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v12, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v12, v16
 ; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v13, v16
@@ -493,14 +493,14 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vid.v v8
+; RV64-BITS-256-NEXT:    vid.v v12
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; RV64-BITS-256-NEXT:    vmv.v.i v12, 0
+; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vrsub.vx v16, v8, a0
+; RV64-BITS-256-NEXT:    vrsub.vx v16, v12, a0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-256-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64-BITS-256-NEXT:    vmerge.vim v12, v8, 1, v0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vrgather.vv v11, v12, v16
 ; RV64-BITS-256-NEXT:    vrgather.vv v10, v13, v16
@@ -514,14 +514,14 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vid.v v8
+; RV64-BITS-512-NEXT:    vid.v v12
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; RV64-BITS-512-NEXT:    vmv.v.i v12, 0
+; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vrsub.vx v16, v8, a0
+; RV64-BITS-512-NEXT:    vrsub.vx v16, v12, a0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-512-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV64-BITS-512-NEXT:    vmerge.vim v12, v8, 1, v0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vrgather.vv v11, v12, v16
 ; RV64-BITS-512-NEXT:    vrgather.vv v10, v13, v16
@@ -539,23 +539,23 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV32-BITS-UNKNOWN-NEXT:    vid.v v16
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v24, 0
+; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
+; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v24, v16, a0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v24, v24, 1, v0
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v16, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v15, v24, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v14, v25, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v13, v26, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v12, v27, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v28, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v29, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v30, v16
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v31, v16
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v15, v16, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v14, v17, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v13, v18, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v12, v19, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v20, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v21, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v22, v24
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v23, v24
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-BITS-UNKNOWN-NEXT:    ret
@@ -564,14 +564,14 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vid.v v8
+; RV32-BITS-256-NEXT:    vid.v v16
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; RV32-BITS-256-NEXT:    vmv.v.i v16, 0
+; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vrsub.vx v24, v8, a0
+; RV32-BITS-256-NEXT:    vrsub.vx v24, v16, a0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-256-NEXT:    vmerge.vim v16, v16, 1, v0
+; RV32-BITS-256-NEXT:    vmerge.vim v16, v8, 1, v0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vrgather.vv v15, v16, v24
 ; RV32-BITS-256-NEXT:    vrgather.vv v14, v17, v24
@@ -589,14 +589,14 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vid.v v8
+; RV32-BITS-512-NEXT:    vid.v v16
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; RV32-BITS-512-NEXT:    vmv.v.i v16, 0
+; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
 ; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vrsub.vx v24, v8, a0
+; RV32-BITS-512-NEXT:    vrsub.vx v24, v16, a0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-512-NEXT:    vmerge.vim v16, v16, 1, v0
+; RV32-BITS-512-NEXT:    vmerge.vim v16, v8, 1, v0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vrgather.vv v15, v16, v24
 ; RV32-BITS-512-NEXT:    vrgather.vv v14, v17, v24
@@ -614,23 +614,23 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV64-BITS-UNKNOWN-NEXT:    vid.v v16
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v24, 0
+; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
+; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v24, v16, a0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v24, v24, 1, v0
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v16, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v15, v24, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v14, v25, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v13, v26, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v12, v27, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v28, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v29, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v30, v16
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v31, v16
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v15, v16, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v14, v17, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v13, v18, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v12, v19, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v20, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v21, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v22, v24
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v23, v24
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-UNKNOWN-NEXT:    ret
@@ -639,14 +639,14 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vid.v v8
+; RV64-BITS-256-NEXT:    vid.v v16
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; RV64-BITS-256-NEXT:    vmv.v.i v16, 0
+; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vrsub.vx v24, v8, a0
+; RV64-BITS-256-NEXT:    vrsub.vx v24, v16, a0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-256-NEXT:    vmerge.vim v16, v16, 1, v0
+; RV64-BITS-256-NEXT:    vmerge.vim v16, v8, 1, v0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vrgather.vv v15, v16, v24
 ; RV64-BITS-256-NEXT:    vrgather.vv v14, v17, v24
@@ -664,14 +664,14 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vid.v v8
+; RV64-BITS-512-NEXT:    vid.v v16
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; RV64-BITS-512-NEXT:    vmv.v.i v16, 0
+; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
 ; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vrsub.vx v24, v8, a0
+; RV64-BITS-512-NEXT:    vrsub.vx v24, v16, a0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-512-NEXT:    vmerge.vim v16, v16, 1, v0
+; RV64-BITS-512-NEXT:    vmerge.vim v16, v8, 1, v0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vrgather.vv v15, v16, v24
 ; RV64-BITS-512-NEXT:    vrgather.vv v14, v17, v24
@@ -1969,18 +1969,18 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v22, v16, a0
-; CHECK-NEXT:    vrgather.vv v21, v10, v22
-; CHECK-NEXT:    vrgather.vv v19, v12, v22
-; CHECK-NEXT:    vrgather.vv v18, v13, v22
-; CHECK-NEXT:    vrgather.vv v20, v11, v22
+; CHECK-NEXT:    vrsub.vx v24, v16, a0
+; CHECK-NEXT:    vrgather.vv v21, v10, v24
+; CHECK-NEXT:    vrgather.vv v19, v12, v24
+; CHECK-NEXT:    vrgather.vv v18, v13, v24
+; CHECK-NEXT:    vrgather.vv v20, v11, v24
 ; CHECK-NEXT:    vmv2r.v v16, v18
 ; CHECK-NEXT:    vmv2r.v v18, v20
-; CHECK-NEXT:    vrgather.vv v31, v8, v22
-; CHECK-NEXT:    vrgather.vv v30, v9, v22
-; CHECK-NEXT:    vrgather.vv v9, v14, v22
-; CHECK-NEXT:    vrgather.vv v8, v15, v22
-; CHECK-NEXT:    vmv2r.v v20, v30
+; CHECK-NEXT:    vrgather.vv v23, v8, v24
+; CHECK-NEXT:    vrgather.vv v22, v9, v24
+; CHECK-NEXT:    vrgather.vv v9, v14, v24
+; CHECK-NEXT:    vrgather.vv v8, v15, v24
+; CHECK-NEXT:    vmv2r.v v20, v22
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
   %res = call <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
@@ -2007,23 +2007,23 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
 ; RV32-NEXT:    vid.v v20
 ; RV32-NEXT:    srli a1, a0, 3
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vrsub.vx v20, v20, a1
-; RV32-NEXT:    vrgather.vv v31, v12, v20
-; RV32-NEXT:    vrgather.vv v7, v8, v20
-; RV32-NEXT:    vrgather.vv v30, v13, v20
-; RV32-NEXT:    vrgather.vv v6, v9, v20
-; RV32-NEXT:    vrgather.vv v29, v14, v20
-; RV32-NEXT:    vrgather.vv v5, v10, v20
-; RV32-NEXT:    vrgather.vv v28, v15, v20
-; RV32-NEXT:    vrgather.vv v4, v11, v20
+; RV32-NEXT:    vrsub.vx v7, v20, a1
+; RV32-NEXT:    vrgather.vv v31, v12, v7
+; RV32-NEXT:    vrgather.vv v23, v8, v7
+; RV32-NEXT:    vrgather.vv v30, v13, v7
+; RV32-NEXT:    vrgather.vv v22, v9, v7
+; RV32-NEXT:    vrgather.vv v29, v14, v7
+; RV32-NEXT:    vrgather.vv v21, v10, v7
+; RV32-NEXT:    vrgather.vv v28, v15, v7
+; RV32-NEXT:    vrgather.vv v20, v11, v7
 ; RV32-NEXT:    addi a1, sp, 64
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    vrgather.vv v27, v16, v20
-; RV32-NEXT:    vs4r.v v4, (a0)
-; RV32-NEXT:    vrgather.vv v26, v17, v20
-; RV32-NEXT:    vrgather.vv v25, v18, v20
-; RV32-NEXT:    vrgather.vv v24, v19, v20
+; RV32-NEXT:    vrgather.vv v27, v16, v7
+; RV32-NEXT:    vs4r.v v20, (a0)
+; RV32-NEXT:    vrgather.vv v26, v17, v7
+; RV32-NEXT:    vrgather.vv v25, v18, v7
+; RV32-NEXT:    vrgather.vv v24, v19, v7
 ; RV32-NEXT:    vs8r.v v24, (a1)
 ; RV32-NEXT:    vl8re64.v v16, (a0)
 ; RV32-NEXT:    vl8re64.v v8, (a1)
@@ -2056,23 +2056,23 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
 ; RV64-NEXT:    vid.v v20
 ; RV64-NEXT:    srli a1, a0, 3
 ; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    vrsub.vx v20, v20, a1
-; RV64-NEXT:    vrgather.vv v31, v12, v20
-; RV64-NEXT:    vrgather.vv v7, v8, v20
-; RV64-NEXT:    vrgather.vv v30, v13, v20
-; RV64-NEXT:    vrgather.vv v6, v9, v20
-; RV64-NEXT:    vrgather.vv v29, v14, v20
-; RV64-NEXT:    vrgather.vv v5, v10, v20
-; RV64-NEXT:    vrgather.vv v28, v15, v20
-; RV64-NEXT:    vrgather.vv v4, v11, v20
+; RV64-NEXT:    vrsub.vx v7, v20, a1
+; RV64-NEXT:    vrgather.vv v31, v12, v7
+; RV64-NEXT:    vrgather.vv v23, v8, v7
+; RV64-NEXT:    vrgather.vv v30, v13, v7
+; RV64-NEXT:    vrgather.vv v22, v9, v7
+; RV64-NEXT:    vrgather.vv v29, v14, v7
+; RV64-NEXT:    vrgather.vv v21, v10, v7
+; RV64-NEXT:    vrgather.vv v28, v15, v7
+; RV64-NEXT:    vrgather.vv v20, v11, v7
 ; RV64-NEXT:    addi a1, sp, 64
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    vrgather.vv v27, v16, v20
-; RV64-NEXT:    vs4r.v v4, (a0)
-; RV64-NEXT:    vrgather.vv v26, v17, v20
-; RV64-NEXT:    vrgather.vv v25, v18, v20
-; RV64-NEXT:    vrgather.vv v24, v19, v20
+; RV64-NEXT:    vrgather.vv v27, v16, v7
+; RV64-NEXT:    vs4r.v v20, (a0)
+; RV64-NEXT:    vrgather.vv v26, v17, v7
+; RV64-NEXT:    vrgather.vv v25, v18, v7
+; RV64-NEXT:    vrgather.vv v24, v19, v7
 ; RV64-NEXT:    vs8r.v v24, (a1)
 ; RV64-NEXT:    vl8re64.v v16, (a0)
 ; RV64-NEXT:    vl8re64.v v8, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/narrow-shift-extend.ll b/llvm/test/CodeGen/RISCV/rvv/narrow-shift-extend.ll
index 53f8a2503354c..557e809f4f9a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/narrow-shift-extend.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/narrow-shift-extend.ll
@@ -121,10 +121,10 @@ define <vscale x 4 x i32> @test_vloxei6(ptr %ptr, <vscale x 4 x i7> %offset, i64
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v12, v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf8 v12, v8
-; CHECK-NEXT:    vsll.vi v12, v12, 4
+; CHECK-NEXT:    vzext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vi v12, v8, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vloxei64.v v8, (a0), v12
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
index 6b08c4409fb63..948df1948ccef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
@@ -4,23 +4,23 @@
 define <vscale x 4 x i8> @foo(ptr %p) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a0)
 ; CHECK-NEXT:    lui a0, 4
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    li a0, 248
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsll.vi v8, v8, 3
+; CHECK-NEXT:    vsll.vi v12, v10, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vand.vx v8, v12, a0
+; CHECK-NEXT:    vzext.vf2 v10, v12
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -361
-; CHECK-NEXT:    vmacc.vx v10, a0, v8
+; CHECK-NEXT:    vmacc.vx v8, a0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 15
+; CHECK-NEXT:    vnsrl.wi v10, v8, 15
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-NEXT:    ret
   %i13 = load <vscale x 4 x i16>, ptr %p, align 2
   %i14 = zext <vscale x 4 x i16> %i13 to <vscale x 4 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index 06a357eeaeb61..ab9849631663c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -143,13 +143,13 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:  .LBB0_12: # %for.body7.us.19
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv.s.x v16, a0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v16, v8, 1
+; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v16, 0
-; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vmsne.vi v16, v8, 0
+; CHECK-NEXT:    vmv.x.s a0, v16
 ; CHECK-NEXT:    snez a0, a0
 ; CHECK-NEXT:    sb a0, 0(zero)
 ; CHECK-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 64d4c88cbb365..722108f5282fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1370,7 +1370,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
@@ -1378,9 +1378,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv8r.v v0, v16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
@@ -1404,7 +1405,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    csrr t0, vlenb
 ; CHECK-NEXT:    slli t0, t0, 1
 ; CHECK-NEXT:    mv t1, t0
-; CHECK-NEXT:    slli t0, t0, 3
+; CHECK-NEXT:    slli t0, t0, 2
 ; CHECK-NEXT:    add t0, t0, t1
 ; CHECK-NEXT:    add t0, sp, t0
 ; CHECK-NEXT:    addi t0, t0, 16
@@ -1415,7 +1416,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    mv t0, a0
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add t0, t0, a0
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add a0, a0, t0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
@@ -1449,15 +1450,15 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v6, v24, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v6, v24, v16, v0.t
 ; CHECK-NEXT:    add a0, a3, a3
 ; CHECK-NEXT:    bltu a2, a5, .LBB85_4
 ; CHECK-NEXT:  # %bb.3:
@@ -1467,7 +1468,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    csrr a6, vlenb
 ; CHECK-NEXT:    slli a6, a6, 1
 ; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 3
+; CHECK-NEXT:    slli a6, a6, 2
 ; CHECK-NEXT:    add a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
@@ -1478,37 +1479,28 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 1
 ; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 2
+; CHECK-NEXT:    slli a6, a6, 1
+; CHECK-NEXT:    add a7, a7, a6
+; CHECK-NEXT:    slli a6, a6, 3
 ; CHECK-NEXT:    add a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
 ; CHECK-NEXT:    csrr a5, vlenb
 ; CHECK-NEXT:    mv a6, a5
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    add a6, a6, a5
-; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    slli a5, a5, 2
 ; CHECK-NEXT:    add a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v4, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v6, v5, a3
 ; CHECK-NEXT:    bltu a2, a4, .LBB85_6
@@ -1516,20 +1508,29 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    mv a2, a4
 ; CHECK-NEXT:  .LBB85_6:
 ; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a5, a5, a4
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    add a4, a4, a5
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    mv a4, a2
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    add a4, a4, a2
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v4, a3
 ; CHECK-NEXT:    add a0, a1, a1
@@ -3574,7 +3575,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
@@ -3582,9 +3583,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
@@ -3608,7 +3610,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    csrr t0, vlenb
 ; ZVFHMIN-NEXT:    slli t0, t0, 1
 ; ZVFHMIN-NEXT:    mv t1, t0
-; ZVFHMIN-NEXT:    slli t0, t0, 3
+; ZVFHMIN-NEXT:    slli t0, t0, 2
 ; ZVFHMIN-NEXT:    add t0, t0, t1
 ; ZVFHMIN-NEXT:    add t0, sp, t0
 ; ZVFHMIN-NEXT:    addi t0, t0, 16
@@ -3619,7 +3621,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mv t0, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add t0, t0, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, t0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -3653,15 +3655,15 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v6, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v6, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
 ; ZVFHMIN-NEXT:  # %bb.3:
@@ -3671,7 +3673,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
 ; ZVFHMIN-NEXT:    slli a6, a6, 1
 ; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 3
+; ZVFHMIN-NEXT:    slli a6, a6, 2
 ; ZVFHMIN-NEXT:    add a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
@@ -3682,37 +3684,28 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a6, a6, -1
 ; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 1
 ; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 2
+; ZVFHMIN-NEXT:    slli a6, a6, 1
+; ZVFHMIN-NEXT:    add a7, a7, a6
+; ZVFHMIN-NEXT:    slli a6, a6, 3
 ; ZVFHMIN-NEXT:    add a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a5, vlenb
 ; ZVFHMIN-NEXT:    mv a6, a5
 ; ZVFHMIN-NEXT:    slli a5, a5, 1
 ; ZVFHMIN-NEXT:    add a6, a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 3
+; ZVFHMIN-NEXT:    slli a5, a5, 2
 ; ZVFHMIN-NEXT:    add a5, a5, a6
 ; ZVFHMIN-NEXT:    add a5, sp, a5
 ; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v4, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v6, v5, a3
 ; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
@@ -3720,20 +3713,29 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a5, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    mv a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 1
 ; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
 ; ZVFHMIN-NEXT:    add a0, a1, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index c46334fe556eb..9b794538c404e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -5750,11 +5750,11 @@ define void @sink_splat_vfwadd_vf(ptr nocapture %a, ptr nocapture %b, float %f)
 ; CHECK-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB125_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v10, (a0)
 ; CHECK-NEXT:    addi a1, a1, 4
 ; CHECK-NEXT:    addi a2, a2, -4
-; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    vs2r.v v10, (a0)
+; CHECK-NEXT:    vfwadd.vf v8, v10, fa0
+; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    j .LBB125_1
 entry:
@@ -5823,11 +5823,11 @@ define void @sink_splat_vfwmul_vf(ptr nocapture %a, ptr nocapture %b, float %f)
 ; CHECK-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB127_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v10, (a0)
 ; CHECK-NEXT:    addi a1, a1, 4
 ; CHECK-NEXT:    addi a2, a2, -4
-; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    vs2r.v v10, (a0)
+; CHECK-NEXT:    vfwmul.vf v8, v10, fa0
+; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    j .LBB127_1
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
index 16c4ade7fa9cb..ed6b7f1e6efb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -33,11 +33,11 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc
 ; RV32-NEXT:    sltu a7, t1, a7
 ; RV32-NEXT:    xor t0, t1, a5
 ; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    vmslt.vx v10, v8, a2
-; RV32-NEXT:    vcompress.vm v12, v8, v10
-; RV32-NEXT:    vcpop.m a7, v10
+; RV32-NEXT:    vmslt.vx v12, v8, a2
+; RV32-NEXT:    vcompress.vm v10, v8, v12
+; RV32-NEXT:    vcpop.m a7, v12
 ; RV32-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
-; RV32-NEXT:    vse32.v v12, (a0)
+; RV32-NEXT:    vse32.v v10, (a0)
 ; RV32-NEXT:    slli a7, a7, 2
 ; RV32-NEXT:    or t0, t0, a6
 ; RV32-NEXT:    add a0, a0, a7
@@ -95,11 +95,11 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a6)
 ; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vmslt.vx v10, v8, a2
-; RV64-NEXT:    vcompress.vm v12, v8, v10
-; RV64-NEXT:    vcpop.m a7, v10
+; RV64-NEXT:    vmslt.vx v12, v8, a2
+; RV64-NEXT:    vcompress.vm v10, v8, v12
+; RV64-NEXT:    vcpop.m a7, v12
 ; RV64-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
-; RV64-NEXT:    vse32.v v12, (a0)
+; RV64-NEXT:    vse32.v v10, (a0)
 ; RV64-NEXT:    slli a7, a7, 2
 ; RV64-NEXT:    add a0, a0, a7
 ; RV64-NEXT:    bne a6, a5, .LBB0_4
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index d2e743b17f38a..20e68b13bae10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -161,15 +161,15 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vslidedown.vi v12, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 4
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v9, v12
+; CHECK-NEXT:    vmv1r.v v9, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vlseg3e32.v v8, (a0)
@@ -196,22 +196,22 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vslidedown.vi v11, v8, 4
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v12, v8, 6
+; CHECK-NEXT:    vslidedown.vi v13, v8, 4
+; CHECK-NEXT:    vslidedown.vi v14, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v14, v8, 8
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v10, a1
-; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    vslideup.vx v13, v12, a1
+; CHECK-NEXT:    vslideup.vx v8, v14, a1
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    vslideup.vx v8, v13, a0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v9, v14
+; CHECK-NEXT:    vmv1r.v v9, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vlseg5e16.v v8, (a0)
@@ -538,15 +538,15 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vslidedown.vi v12, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 4
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v9, v12
+; CHECK-NEXT:    vmv1r.v v9, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vlseg3e32.v v8, (a0)
@@ -573,22 +573,22 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vslidedown.vi v11, v8, 4
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v12, v8, 6
+; CHECK-NEXT:    vslidedown.vi v13, v8, 4
+; CHECK-NEXT:    vslidedown.vi v14, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v14, v8, 8
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v10, a1
-; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    vslideup.vx v13, v12, a1
+; CHECK-NEXT:    vslideup.vx v8, v14, a1
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    vslideup.vx v8, v13, a0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v9, v14
+; CHECK-NEXT:    vmv1r.v v9, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vlseg5e16.v v8, (a0)
@@ -615,28 +615,28 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vslidedown.vi v11, v8, 1
-; CHECK-NEXT:    vmv1r.v v12, v8
-; CHECK-NEXT:    vslidedown.vi v14, v8, 5
-; CHECK-NEXT:    vslidedown.vi v15, v8, 6
+; CHECK-NEXT:    vslidedown.vi v11, v8, 2
+; CHECK-NEXT:    vslidedown.vi v12, v8, 1
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vslidedown.vi v13, v8, 5
+; CHECK-NEXT:    vslidedown.vi v14, v8, 6
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    add a3, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    vslideup.vx v12, v11, a1
+; CHECK-NEXT:    vslideup.vx v11, v9, a1
+; CHECK-NEXT:    vslideup.vx v10, v12, a1
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v12, v10, a0
+; CHECK-NEXT:    vslideup.vx v10, v11, a0
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v13, v8, 4
+; CHECK-NEXT:    vslidedown.vi v11, v8, 4
 ; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v13, v14, a1
+; CHECK-NEXT:    vslideup.vx v11, v13, a1
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v13, v15, a0
+; CHECK-NEXT:    vslideup.vx v11, v14, a0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs2r.v v12, (a0)
+; CHECK-NEXT:    vs2r.v v10, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vlseg7e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 14f306da21dba..745c2cd72a6f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -16,14 +16,14 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vnsrl.wi v10, v12, 8
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmsne.vi v8, v10, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vnsrl.wi v14, v8, 8
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 32 x i1>, ptr %p
   %deinterleaved.results = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index ff55c09a0707b..319dfb037c15a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -10,18 +10,18 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v12, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v14, v8, 1, v0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vnsrl.wi v10, v12, 8
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmsne.vi v8, v10, 0
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vnsrl.wi v14, v8, 8
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
 ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
@@ -74,12 +74,12 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.v.x v17, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vcompress.vm v12, v8, v16
-; CHECK-NEXT:    vcompress.vm v20, v8, v17
+; CHECK-NEXT:    vcompress.vm v16, v8, v20
 ; CHECK-NEXT:    vmv2r.v v8, v12
-; CHECK-NEXT:    vmv2r.v v10, v20
+; CHECK-NEXT:    vmv2r.v v10, v16
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
 ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
@@ -92,12 +92,12 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v24, a0
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.v.x v25, a0
+; CHECK-NEXT:    vmv.v.x v7, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vcompress.vm v16, v8, v24
-; CHECK-NEXT:    vcompress.vm v0, v8, v25
+; CHECK-NEXT:    vcompress.vm v24, v8, v7
 ; CHECK-NEXT:    vmv4r.v v8, v16
-; CHECK-NEXT:    vmv4r.v v12, v0
+; CHECK-NEXT:    vmv4r.v v12, v24
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
 ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
@@ -336,12 +336,12 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.v.x v17, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vcompress.vm v12, v8, v16
-; CHECK-NEXT:    vcompress.vm v20, v8, v17
+; CHECK-NEXT:    vcompress.vm v16, v8, v20
 ; CHECK-NEXT:    vmv2r.v v8, v12
-; CHECK-NEXT:    vmv2r.v v10, v20
+; CHECK-NEXT:    vmv2r.v v10, v16
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
 ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
@@ -574,9 +574,9 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
@@ -584,46 +584,46 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v26, v13
+; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v10, v15
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v27, v8
+; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v11, v12
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v21
+; CHECK-NEXT:    vmv1r.v v8, v21
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v12
+; CHECK-NEXT:    vmv1r.v v9, v14
 ; CHECK-NEXT:    vs8r.v v16, (a0)
-; CHECK-NEXT:    vmv1r.v v28, v9
-; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vmv1r.v v12, v13
+; CHECK-NEXT:    vs8r.v v8, (a1)
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg5e8.v v8, (a0)
-; CHECK-NEXT:    vlseg5e8.v v14, (a1)
-; CHECK-NEXT:    vmv2r.v v20, v8
-; CHECK-NEXT:    vmv2r.v v22, v10
-; CHECK-NEXT:    vmv1r.v v21, v14
+; CHECK-NEXT:    vlseg5e8.v v12, (a0)
+; CHECK-NEXT:    vlseg5e8.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v10, v14
+; CHECK-NEXT:    vmv1r.v v9, v18
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v20, 0
-; CHECK-NEXT:    vmv1r.v v14, v9
-; CHECK-NEXT:    vmsne.vi v8, v14, 0
-; CHECK-NEXT:    vmv1r.v v23, v16
-; CHECK-NEXT:    vmsne.vi v9, v22, 0
-; CHECK-NEXT:    vmv1r.v v16, v11
-; CHECK-NEXT:    vmsne.vi v10, v16, 0
-; CHECK-NEXT:    vmv1r.v v13, v18
-; CHECK-NEXT:    vmsne.vi v11, v12, 0
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmv1r.v v18, v13
+; CHECK-NEXT:    vmsne.vi v8, v18, 0
+; CHECK-NEXT:    vmv1r.v v11, v20
+; CHECK-NEXT:    vmsne.vi v9, v10, 0
+; CHECK-NEXT:    vmv1r.v v20, v15
+; CHECK-NEXT:    vmsne.vi v10, v20, 0
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmsne.vi v11, v16, 0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
@@ -806,9 +806,9 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    srli a3, a0, 3
@@ -816,40 +816,40 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a2
 ; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v22, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a1
-; CHECK-NEXT:    vmv1r.v v26, v13
+; CHECK-NEXT:    vmv1r.v v10, v15
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v27, v14
+; CHECK-NEXT:    vmerge.vim v24, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v11, v24
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a2
-; CHECK-NEXT:    vmv1r.v v24, v23
+; CHECK-NEXT:    vmv1r.v v8, v23
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v25, v12
+; CHECK-NEXT:    vmv1r.v v9, v14
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v28, v15
+; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v12, v25
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v29, v8
+; CHECK-NEXT:    vmv1r.v v13, v14
 ; CHECK-NEXT:    vs8r.v v16, (a0)
-; CHECK-NEXT:    vmv1r.v v30, v9
-; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vmv1r.v v14, v15
+; CHECK-NEXT:    vs8r.v v8, (a1)
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vlseg7e8.v v8, (a0)
 ; CHECK-NEXT:    vlseg7e8.v v16, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 1e4cb06480163..e2f81acc9cf99 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -49,18 +49,20 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vector_interleave_v16i16_v8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_v16i16_v8i16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    vwsll.vi v8, v10, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
 	   %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
 	   ret <16 x i16> %res
@@ -70,19 +72,21 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vector_interleave_v8i32_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_v8i32_v4i32:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    li a0, 32
 ; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vx v10, v9, a0
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    li a0, 32
+; ZVBB-NEXT:    vwsll.vx v8, v10, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
 	   %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
 	   ret <8 x i32> %res
@@ -205,18 +209,18 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; CHECK-NEXT:    vsseg5e16.v v8, (a0)
 ; CHECK-NEXT:    add a4, a3, a1
 ; CHECK-NEXT:    vle16.v v9, (a2)
-; CHECK-NEXT:    vle16.v v10, (a4)
-; CHECK-NEXT:    vle16.v v11, (a3)
+; CHECK-NEXT:    vle16.v v11, (a4)
+; CHECK-NEXT:    vle16.v v12, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    add a1, a4, a1
-; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vle16.v v10, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 2
+; CHECK-NEXT:    vslideup.vi v12, v11, 2
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 4
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
@@ -242,18 +246,18 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
 ; ZVBB-NEXT:    add a4, a3, a1
 ; ZVBB-NEXT:    vle16.v v9, (a2)
-; ZVBB-NEXT:    vle16.v v10, (a4)
-; ZVBB-NEXT:    vle16.v v11, (a3)
+; ZVBB-NEXT:    vle16.v v11, (a4)
+; ZVBB-NEXT:    vle16.v v12, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
 ; ZVBB-NEXT:    add a1, a4, a1
-; ZVBB-NEXT:    vle16.v v12, (a1)
+; ZVBB-NEXT:    vle16.v v10, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vi v11, v10, 2
+; ZVBB-NEXT:    vslideup.vi v12, v11, 2
 ; ZVBB-NEXT:    vslideup.vi v8, v9, 2
 ; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v11, 4
+; ZVBB-NEXT:    vslideup.vi v8, v12, 4
 ; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v12, 8
+; ZVBB-NEXT:    vslideup.vi v8, v10, 8
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
@@ -424,18 +428,20 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b)
 ; CHECK-LABEL: vector_interleave_v16f16_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_v16f16_v8f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    vwsll.vi v8, v10, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
 	   %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
 	   ret <16 x half> %res
@@ -445,19 +451,21 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b
 ; CHECK-LABEL: vector_interleave_v8f32_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_v8f32_v4f32:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    li a0, 32
 ; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vx v10, v9, a0
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    li a0, 32
+; ZVBB-NEXT:    vwsll.vx v8, v10, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
 	   %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
 	   ret <8 x float> %res
@@ -580,18 +588,18 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; CHECK-NEXT:    vsseg5e16.v v8, (a0)
 ; CHECK-NEXT:    add a4, a3, a1
 ; CHECK-NEXT:    vle16.v v9, (a2)
-; CHECK-NEXT:    vle16.v v10, (a4)
-; CHECK-NEXT:    vle16.v v11, (a3)
+; CHECK-NEXT:    vle16.v v11, (a4)
+; CHECK-NEXT:    vle16.v v12, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    add a1, a4, a1
-; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vle16.v v10, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 2
+; CHECK-NEXT:    vslideup.vi v12, v11, 2
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 4
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
@@ -617,18 +625,18 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
 ; ZVBB-NEXT:    add a4, a3, a1
 ; ZVBB-NEXT:    vle16.v v9, (a2)
-; ZVBB-NEXT:    vle16.v v10, (a4)
-; ZVBB-NEXT:    vle16.v v11, (a3)
+; ZVBB-NEXT:    vle16.v v11, (a4)
+; ZVBB-NEXT:    vle16.v v12, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
 ; ZVBB-NEXT:    add a1, a4, a1
-; ZVBB-NEXT:    vle16.v v12, (a1)
+; ZVBB-NEXT:    vle16.v v10, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vi v11, v10, 2
+; ZVBB-NEXT:    vslideup.vi v12, v11, 2
 ; ZVBB-NEXT:    vslideup.vi v8, v9, 2
 ; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v11, 4
+; ZVBB-NEXT:    vslideup.vi v8, v12, 4
 ; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v12, 8
+; ZVBB-NEXT:    vslideup.vi v8, v10, 8
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 8f6365d35f885..b5eb312bf5e18 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -17,17 +17,17 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vwmaccu.vx v16, a1, v12
-; CHECK-NEXT:    vmsne.vi v8, v18, 0
-; CHECK-NEXT:    vmsne.vi v9, v16, 0
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a1, v12
+; CHECK-NEXT:    vmsne.vi v12, v10, 0
+; CHECK-NEXT:    vmsne.vi v10, v8, 0
 ; CHECK-NEXT:    add a1, a2, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a2
+; CHECK-NEXT:    vslideup.vx v10, v12, a2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vsm.v v9, (a0)
+; CHECK-NEXT:    vsm.v v10, (a0)
 ; CHECK-NEXT:    ret
   %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
   store <vscale x 32 x i1> %res, ptr %p
@@ -104,17 +104,17 @@ define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vs
 ; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    vmv4r.v v28, v16
 ; CHECK-NEXT:    vmv4r.v v16, v12
-; CHECK-NEXT:    vsrl.vi v8, v6, 1
-; CHECK-NEXT:    vand.vi v10, v6, 1
+; CHECK-NEXT:    vsrl.vi v4, v6, 1
+; CHECK-NEXT:    vand.vi v8, v6, 1
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vadd.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vadd.vx v4, v4, a2, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v0, v24, v8
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v8
+; CHECK-NEXT:    vrgatherei16.vv v8, v24, v4
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v4
 ; CHECK-NEXT:    vs8r.v v24, (a1)
-; CHECK-NEXT:    vs8r.v v0, (a0)
+; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   store <vscale x 16 x i64> %res, ptr %p
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 6aa62c2256925..a41ed52343099 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -19,15 +19,15 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vwmaccu.vx v16, a0, v12
-; CHECK-NEXT:    vmsne.vi v8, v18, 0
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
+; CHECK-NEXT:    vmsne.vi v12, v10, 0
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v8, a1
+; CHECK-NEXT:    vslideup.vx v0, v12, a1
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
@@ -57,18 +57,20 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
 ; CHECK-LABEL: vector_interleave_nxv32i8_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv32i8_nxv16i8:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vi v12, v10, 8
-; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    vwsll.vi v8, v12, 8
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 32 x i8> %res
@@ -78,18 +80,20 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
 ; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv16i16_nxv8i16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vi v12, v10, 16
-; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    vwsll.vi v8, v12, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 16 x i16> %res
@@ -99,19 +103,21 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
 ; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
 ; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vx v12, v10, a0
-; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vwsll.vx v8, v12, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 8 x i32> %res
@@ -361,18 +367,20 @@ define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x
 ; CHECK-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    vwsll.vi v8, v10, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
   ret <vscale x 8 x bfloat> %res
@@ -417,18 +425,20 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half
 ; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    vwsll.vi v8, v10, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
   ret <vscale x 8 x half> %res
@@ -438,19 +448,21 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x flo
 ; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
 ; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; ZVBB-NEXT:    vwsll.vx v10, v9, a0
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    vmv2r.v v8, v10
+; ZVBB-NEXT:    vwsll.vx v8, v10, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
   ret <vscale x 4 x float> %res
@@ -460,18 +472,20 @@ define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8
 ; CHECK-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vi v12, v10, 16
-; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    vwsll.vi v8, v12, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
   ret <vscale x 16 x bfloat> %res
@@ -481,18 +495,20 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x ha
 ; CHECK-LABEL: vector_interleave_nxv16f16_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vi v12, v10, 16
-; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    vwsll.vi v8, v12, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
   ret <vscale x 16 x half> %res
@@ -502,19 +518,21 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x flo
 ; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
 ; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vx v12, v10, a0
-; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vwsll.vx v8, v12, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
   ret <vscale x 8 x float> %res
@@ -684,15 +702,15 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison(<vscale x 4
 ; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32_poison:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32_poison:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; ZVBB-NEXT:    vzext.vf2 v12, v8
-; ZVBB-NEXT:    vmv.v.v v8, v12
+; ZVBB-NEXT:    vmv2r.v v12, v8
+; ZVBB-NEXT:    vzext.vf2 v8, v12
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
   ret <vscale x 8 x i32> %res
@@ -709,10 +727,10 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison2(<vscale x 4
 ;
 ; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32_poison2:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vmv2r.v v12, v8
 ; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; ZVBB-NEXT:    vwsll.vx v12, v8, a0
-; ZVBB-NEXT:    vmv4r.v v8, v12
+; ZVBB-NEXT:    vwsll.vx v8, v12, a0
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
   ret <vscale x 8 x i32> %res
@@ -2062,37 +2080,37 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
 ; CHECK-NEXT:    add a3, a4, a2
 ; CHECK-NEXT:    srli a1, a2, 2
 ; CHECK-NEXT:    add a5, a0, a2
-; CHECK-NEXT:    vmv4r.v v24, v16
+; CHECK-NEXT:    vmv2r.v v2, v18
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
+; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
 ; CHECK-NEXT:    add a6, a3, a2
-; CHECK-NEXT:    vmv1r.v v25, v22
+; CHECK-NEXT:    vmv1r.v v3, v16
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v26, v18
+; CHECK-NEXT:    vmv1r.v v4, v20
 ; CHECK-NEXT:    vmv1r.v v0, v11
-; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v27, v8
+; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v5, v8
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v28, v20
-; CHECK-NEXT:    vmv1r.v v18, v23
+; CHECK-NEXT:    vmv1r.v v6, v22
+; CHECK-NEXT:    vmv1r.v v20, v17
 ; CHECK-NEXT:    add a7, a6, a2
-; CHECK-NEXT:    vmv1r.v v29, v10
-; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    vmv1r.v v7, v10
+; CHECK-NEXT:    vmv1r.v v22, v9
 ; CHECK-NEXT:    vmv1r.v v0, v13
-; CHECK-NEXT:    vmerge.vim v30, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v22, v11
+; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v24, v11
 ; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vsseg7e8.v v24, (a4)
-; CHECK-NEXT:    vmv1r.v v23, v31
-; CHECK-NEXT:    vsseg7e8.v v17, (a0)
+; CHECK-NEXT:    vsseg7e8.v v2, (a4)
+; CHECK-NEXT:    vmv1r.v v25, v9
+; CHECK-NEXT:    vsseg7e8.v v19, (a0)
 ; CHECK-NEXT:    vl1r.v v8, (a6)
 ; CHECK-NEXT:    add a6, a7, a2
 ; CHECK-NEXT:    vl1r.v v10, (a4)
@@ -2156,37 +2174,37 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    add a0, sp, a0
 ; ZVBB-NEXT:    addi a0, a0, 16
 ; ZVBB-NEXT:    csrr a2, vlenb
-; ZVBB-NEXT:    vmerge.vim v16, v14, 1, v0
+; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v0, v8
-; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
+; ZVBB-NEXT:    vmerge.vim v16, v14, 1, v0
 ; ZVBB-NEXT:    add a3, a4, a2
 ; ZVBB-NEXT:    srli a1, a2, 2
 ; ZVBB-NEXT:    add a5, a0, a2
-; ZVBB-NEXT:    vmv4r.v v24, v16
+; ZVBB-NEXT:    vmv2r.v v2, v18
 ; ZVBB-NEXT:    vmv1r.v v0, v9
-; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
+; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
 ; ZVBB-NEXT:    add a6, a3, a2
-; ZVBB-NEXT:    vmv1r.v v25, v22
+; ZVBB-NEXT:    vmv1r.v v3, v16
 ; ZVBB-NEXT:    vmv1r.v v0, v10
 ; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v26, v18
+; ZVBB-NEXT:    vmv1r.v v4, v20
 ; ZVBB-NEXT:    vmv1r.v v0, v11
-; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v27, v8
+; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v5, v8
 ; ZVBB-NEXT:    vmv1r.v v0, v12
 ; ZVBB-NEXT:    vmerge.vim v10, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v28, v20
-; ZVBB-NEXT:    vmv1r.v v18, v23
+; ZVBB-NEXT:    vmv1r.v v6, v22
+; ZVBB-NEXT:    vmv1r.v v20, v17
 ; ZVBB-NEXT:    add a7, a6, a2
-; ZVBB-NEXT:    vmv1r.v v29, v10
-; ZVBB-NEXT:    vmv1r.v v20, v9
+; ZVBB-NEXT:    vmv1r.v v7, v10
+; ZVBB-NEXT:    vmv1r.v v22, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v13
-; ZVBB-NEXT:    vmerge.vim v30, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v22, v11
+; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v24, v11
 ; ZVBB-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e8.v v24, (a4)
-; ZVBB-NEXT:    vmv1r.v v23, v31
-; ZVBB-NEXT:    vsseg7e8.v v17, (a0)
+; ZVBB-NEXT:    vsseg7e8.v v2, (a4)
+; ZVBB-NEXT:    vmv1r.v v25, v9
+; ZVBB-NEXT:    vsseg7e8.v v19, (a0)
 ; ZVBB-NEXT:    vl1r.v v8, (a6)
 ; ZVBB-NEXT:    add a6, a7, a2
 ; ZVBB-NEXT:    vl1r.v v10, (a4)
@@ -2281,39 +2299,39 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; RV32-NEXT:    vsseg7e8.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
 ; RV32-NEXT:    vsseg7e8.v v21, (a1)
-; RV32-NEXT:    vl1r.v v10, (a6)
+; RV32-NEXT:    vl1r.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v11, (a6)
+; RV32-NEXT:    vl1r.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v12, (a6)
+; RV32-NEXT:    vl1r.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v13, (a6)
+; RV32-NEXT:    vl1r.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1r.v v18, (a6)
+; RV32-NEXT:    vl1r.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v19, (a6)
-; RV32-NEXT:    vl1r.v v16, (a0)
-; RV32-NEXT:    vl1r.v v8, (a4)
-; RV32-NEXT:    vl1r.v v17, (a3)
-; RV32-NEXT:    vl1r.v v9, (a7)
+; RV32-NEXT:    vl1r.v v11, (a6)
+; RV32-NEXT:    vl1r.v v8, (a0)
+; RV32-NEXT:    vl1r.v v16, (a4)
+; RV32-NEXT:    vl1r.v v9, (a3)
+; RV32-NEXT:    vl1r.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v20, (a6)
+; RV32-NEXT:    vl1r.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v21, (a6)
+; RV32-NEXT:    vl1r.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1r.v v22, (a6)
-; RV32-NEXT:    vl1r.v v23, (a1)
+; RV32-NEXT:    vl1r.v v14, (a6)
+; RV32-NEXT:    vl1r.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v12, (a5)
-; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    vl8r.v v16, (a2)
 ; RV32-NEXT:    vl8r.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -2363,39 +2381,39 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; RV64-NEXT:    vsseg7e8.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
 ; RV64-NEXT:    vsseg7e8.v v21, (a1)
-; RV64-NEXT:    vl1r.v v10, (a6)
+; RV64-NEXT:    vl1r.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v11, (a6)
+; RV64-NEXT:    vl1r.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v12, (a6)
+; RV64-NEXT:    vl1r.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v13, (a6)
+; RV64-NEXT:    vl1r.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1r.v v18, (a6)
+; RV64-NEXT:    vl1r.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v19, (a6)
-; RV64-NEXT:    vl1r.v v16, (a0)
-; RV64-NEXT:    vl1r.v v8, (a4)
-; RV64-NEXT:    vl1r.v v17, (a3)
-; RV64-NEXT:    vl1r.v v9, (a7)
+; RV64-NEXT:    vl1r.v v11, (a6)
+; RV64-NEXT:    vl1r.v v8, (a0)
+; RV64-NEXT:    vl1r.v v16, (a4)
+; RV64-NEXT:    vl1r.v v9, (a3)
+; RV64-NEXT:    vl1r.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v20, (a6)
+; RV64-NEXT:    vl1r.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v21, (a6)
+; RV64-NEXT:    vl1r.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1r.v v22, (a6)
-; RV64-NEXT:    vl1r.v v23, (a1)
+; RV64-NEXT:    vl1r.v v14, (a6)
+; RV64-NEXT:    vl1r.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v12, (a5)
-; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    vl8r.v v16, (a2)
 ; RV64-NEXT:    vl8r.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
@@ -2445,39 +2463,39 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; ZVBB-RV32-NEXT:    vsseg7e8.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV32-NEXT:    vsseg7e8.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1r.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1r.v v18, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1r.v v16, (a0)
-; ZVBB-RV32-NEXT:    vl1r.v v8, (a4)
-; ZVBB-RV32-NEXT:    vl1r.v v17, (a3)
-; ZVBB-RV32-NEXT:    vl1r.v v9, (a7)
+; ZVBB-RV32-NEXT:    vl1r.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1r.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1r.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1r.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1r.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1r.v v23, (a1)
+; ZVBB-RV32-NEXT:    vl1r.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV32-NEXT:    vl8r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vl8r.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
@@ -2527,39 +2545,39 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; ZVBB-RV64-NEXT:    vsseg7e8.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV64-NEXT:    vsseg7e8.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1r.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1r.v v18, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1r.v v16, (a0)
-; ZVBB-RV64-NEXT:    vl1r.v v8, (a4)
-; ZVBB-RV64-NEXT:    vl1r.v v17, (a3)
-; ZVBB-RV64-NEXT:    vl1r.v v9, (a7)
+; ZVBB-RV64-NEXT:    vl1r.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1r.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1r.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1r.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1r.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1r.v v23, (a1)
+; ZVBB-RV64-NEXT:    vl1r.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV64-NEXT:    vl8r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vl8r.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
@@ -2615,39 +2633,39 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; RV32-NEXT:    vsseg7e16.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
 ; RV32-NEXT:    vsseg7e16.v v21, (a1)
-; RV32-NEXT:    vl1re16.v v10, (a6)
+; RV32-NEXT:    vl1re16.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    vl1re16.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    vl1re16.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re16.v v18, (a6)
+; RV32-NEXT:    vl1re16.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v19, (a6)
-; RV32-NEXT:    vl1re16.v v16, (a0)
-; RV32-NEXT:    vl1re16.v v8, (a4)
-; RV32-NEXT:    vl1re16.v v17, (a3)
-; RV32-NEXT:    vl1re16.v v9, (a7)
+; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v16, (a4)
+; RV32-NEXT:    vl1re16.v v9, (a3)
+; RV32-NEXT:    vl1re16.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v20, (a6)
+; RV32-NEXT:    vl1re16.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v21, (a6)
+; RV32-NEXT:    vl1re16.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re16.v v22, (a6)
-; RV32-NEXT:    vl1re16.v v23, (a1)
+; RV32-NEXT:    vl1re16.v v14, (a6)
+; RV32-NEXT:    vl1re16.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v12, (a5)
-; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    vl8re16.v v16, (a2)
 ; RV32-NEXT:    vl8re16.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -2697,39 +2715,39 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; RV64-NEXT:    vsseg7e16.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
 ; RV64-NEXT:    vsseg7e16.v v21, (a1)
-; RV64-NEXT:    vl1re16.v v10, (a6)
+; RV64-NEXT:    vl1re16.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    vl1re16.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    vl1re16.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re16.v v18, (a6)
+; RV64-NEXT:    vl1re16.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v19, (a6)
-; RV64-NEXT:    vl1re16.v v16, (a0)
-; RV64-NEXT:    vl1re16.v v8, (a4)
-; RV64-NEXT:    vl1re16.v v17, (a3)
-; RV64-NEXT:    vl1re16.v v9, (a7)
+; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v16, (a4)
+; RV64-NEXT:    vl1re16.v v9, (a3)
+; RV64-NEXT:    vl1re16.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v20, (a6)
+; RV64-NEXT:    vl1re16.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v21, (a6)
+; RV64-NEXT:    vl1re16.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re16.v v22, (a6)
-; RV64-NEXT:    vl1re16.v v23, (a1)
+; RV64-NEXT:    vl1re16.v v14, (a6)
+; RV64-NEXT:    vl1re16.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v12, (a5)
-; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    vl8re16.v v16, (a2)
 ; RV64-NEXT:    vl8re16.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
@@ -2779,39 +2797,39 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v16, (a0)
-; ZVBB-RV32-NEXT:    vl1re16.v v8, (a4)
-; ZVBB-RV32-NEXT:    vl1re16.v v17, (a3)
-; ZVBB-RV32-NEXT:    vl1re16.v v9, (a7)
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v23, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
@@ -2861,39 +2879,39 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v16, (a0)
-; ZVBB-RV64-NEXT:    vl1re16.v v8, (a4)
-; ZVBB-RV64-NEXT:    vl1re16.v v17, (a3)
-; ZVBB-RV64-NEXT:    vl1re16.v v9, (a7)
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v23, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
@@ -2949,39 +2967,39 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; RV32-NEXT:    vsseg7e32.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
 ; RV32-NEXT:    vsseg7e32.v v21, (a1)
-; RV32-NEXT:    vl1re32.v v10, (a6)
+; RV32-NEXT:    vl1re32.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v11, (a6)
+; RV32-NEXT:    vl1re32.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v12, (a6)
+; RV32-NEXT:    vl1re32.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v13, (a6)
+; RV32-NEXT:    vl1re32.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re32.v v18, (a6)
+; RV32-NEXT:    vl1re32.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v19, (a6)
-; RV32-NEXT:    vl1re32.v v16, (a0)
-; RV32-NEXT:    vl1re32.v v8, (a4)
-; RV32-NEXT:    vl1re32.v v17, (a3)
-; RV32-NEXT:    vl1re32.v v9, (a7)
+; RV32-NEXT:    vl1re32.v v11, (a6)
+; RV32-NEXT:    vl1re32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v16, (a4)
+; RV32-NEXT:    vl1re32.v v9, (a3)
+; RV32-NEXT:    vl1re32.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v20, (a6)
+; RV32-NEXT:    vl1re32.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v21, (a6)
+; RV32-NEXT:    vl1re32.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re32.v v22, (a6)
-; RV32-NEXT:    vl1re32.v v23, (a1)
+; RV32-NEXT:    vl1re32.v v14, (a6)
+; RV32-NEXT:    vl1re32.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v12, (a5)
-; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    vl8re32.v v16, (a2)
 ; RV32-NEXT:    vl8re32.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -3031,39 +3049,39 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; RV64-NEXT:    vsseg7e32.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
 ; RV64-NEXT:    vsseg7e32.v v21, (a1)
-; RV64-NEXT:    vl1re32.v v10, (a6)
+; RV64-NEXT:    vl1re32.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v11, (a6)
+; RV64-NEXT:    vl1re32.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v12, (a6)
+; RV64-NEXT:    vl1re32.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v13, (a6)
+; RV64-NEXT:    vl1re32.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re32.v v18, (a6)
+; RV64-NEXT:    vl1re32.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v19, (a6)
-; RV64-NEXT:    vl1re32.v v16, (a0)
-; RV64-NEXT:    vl1re32.v v8, (a4)
-; RV64-NEXT:    vl1re32.v v17, (a3)
-; RV64-NEXT:    vl1re32.v v9, (a7)
+; RV64-NEXT:    vl1re32.v v11, (a6)
+; RV64-NEXT:    vl1re32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v16, (a4)
+; RV64-NEXT:    vl1re32.v v9, (a3)
+; RV64-NEXT:    vl1re32.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v20, (a6)
+; RV64-NEXT:    vl1re32.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v21, (a6)
+; RV64-NEXT:    vl1re32.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re32.v v22, (a6)
-; RV64-NEXT:    vl1re32.v v23, (a1)
+; RV64-NEXT:    vl1re32.v v14, (a6)
+; RV64-NEXT:    vl1re32.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v12, (a5)
-; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    vl8re32.v v16, (a2)
 ; RV64-NEXT:    vl8re32.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
@@ -3113,39 +3131,39 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v16, (a0)
-; ZVBB-RV32-NEXT:    vl1re32.v v8, (a4)
-; ZVBB-RV32-NEXT:    vl1re32.v v17, (a3)
-; ZVBB-RV32-NEXT:    vl1re32.v v9, (a7)
+; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re32.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v23, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
@@ -3195,39 +3213,39 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v16, (a0)
-; ZVBB-RV64-NEXT:    vl1re32.v v8, (a4)
-; ZVBB-RV64-NEXT:    vl1re32.v v17, (a3)
-; ZVBB-RV64-NEXT:    vl1re32.v v9, (a7)
+; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re32.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v23, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
@@ -3282,39 +3300,39 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    vsseg7e64.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
 ; RV32-NEXT:    vsseg7e64.v v21, (a1)
-; RV32-NEXT:    vl1re64.v v10, (a6)
+; RV32-NEXT:    vl1re64.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v11, (a6)
+; RV32-NEXT:    vl1re64.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v12, (a6)
+; RV32-NEXT:    vl1re64.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v13, (a6)
+; RV32-NEXT:    vl1re64.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re64.v v18, (a6)
+; RV32-NEXT:    vl1re64.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v19, (a6)
-; RV32-NEXT:    vl1re64.v v16, (a0)
-; RV32-NEXT:    vl1re64.v v8, (a4)
-; RV32-NEXT:    vl1re64.v v17, (a3)
-; RV32-NEXT:    vl1re64.v v9, (a7)
+; RV32-NEXT:    vl1re64.v v11, (a6)
+; RV32-NEXT:    vl1re64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v16, (a4)
+; RV32-NEXT:    vl1re64.v v9, (a3)
+; RV32-NEXT:    vl1re64.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v20, (a6)
+; RV32-NEXT:    vl1re64.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v21, (a6)
+; RV32-NEXT:    vl1re64.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re64.v v22, (a6)
-; RV32-NEXT:    vl1re64.v v23, (a1)
+; RV32-NEXT:    vl1re64.v v14, (a6)
+; RV32-NEXT:    vl1re64.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v12, (a5)
-; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    vl8re64.v v16, (a2)
 ; RV32-NEXT:    vl8re64.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -3364,39 +3382,39 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    vsseg7e64.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
 ; RV64-NEXT:    vsseg7e64.v v21, (a1)
-; RV64-NEXT:    vl1re64.v v10, (a6)
+; RV64-NEXT:    vl1re64.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v11, (a6)
+; RV64-NEXT:    vl1re64.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v12, (a6)
+; RV64-NEXT:    vl1re64.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v13, (a6)
+; RV64-NEXT:    vl1re64.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re64.v v18, (a6)
+; RV64-NEXT:    vl1re64.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v19, (a6)
-; RV64-NEXT:    vl1re64.v v16, (a0)
-; RV64-NEXT:    vl1re64.v v8, (a4)
-; RV64-NEXT:    vl1re64.v v17, (a3)
-; RV64-NEXT:    vl1re64.v v9, (a7)
+; RV64-NEXT:    vl1re64.v v11, (a6)
+; RV64-NEXT:    vl1re64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v16, (a4)
+; RV64-NEXT:    vl1re64.v v9, (a3)
+; RV64-NEXT:    vl1re64.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v20, (a6)
+; RV64-NEXT:    vl1re64.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v21, (a6)
+; RV64-NEXT:    vl1re64.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re64.v v22, (a6)
-; RV64-NEXT:    vl1re64.v v23, (a1)
+; RV64-NEXT:    vl1re64.v v14, (a6)
+; RV64-NEXT:    vl1re64.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v12, (a5)
-; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    vl8re64.v v16, (a2)
 ; RV64-NEXT:    vl8re64.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
@@ -3446,39 +3464,39 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v16, (a0)
-; ZVBB-RV32-NEXT:    vl1re64.v v8, (a4)
-; ZVBB-RV32-NEXT:    vl1re64.v v17, (a3)
-; ZVBB-RV32-NEXT:    vl1re64.v v9, (a7)
+; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re64.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v23, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
@@ -3528,39 +3546,39 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
 ; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v16, (a0)
-; ZVBB-RV64-NEXT:    vl1re64.v v8, (a4)
-; ZVBB-RV64-NEXT:    vl1re64.v v17, (a3)
-; ZVBB-RV64-NEXT:    vl1re64.v v9, (a7)
+; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re64.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v23, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
-; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
 ; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll
index c2ac155304dd5..d27f749880ab4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll
@@ -116,8 +116,8 @@ define <vscale x 2 x i64> @vsext_nxv2i8_nxv2i64(<vscale x 2 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv2i8_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf8 v8, v10
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 2 x i8> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -127,8 +127,8 @@ define <vscale x 2 x i64> @vzext_nxv2i8_nxv2i64(<vscale x 2 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv2i8_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf8 v8, v10
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 2 x i8> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -160,8 +160,8 @@ define <vscale x 4 x i32> @vsext_nxv4i8_nxv4i32(<vscale x 4 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv4i8_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 4 x i8> %va to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %evec
@@ -171,8 +171,8 @@ define <vscale x 4 x i32> @vzext_nxv4i8_nxv4i32(<vscale x 4 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv4i8_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 4 x i8> %va to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %evec
@@ -182,8 +182,8 @@ define <vscale x 4 x i64> @vsext_nxv4i8_nxv4i64(<vscale x 4 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv4i8_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsext.vf8 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsext.vf8 v8, v12
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 4 x i8> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -193,8 +193,8 @@ define <vscale x 4 x i64> @vzext_nxv4i8_nxv4i64(<vscale x 4 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv4i8_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf8 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf8 v8, v12
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 4 x i8> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -204,8 +204,8 @@ define <vscale x 8 x i16> @vsext_nxv8i8_nxv8i16(<vscale x 8 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv8i8_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 8 x i8> %va to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %evec
@@ -215,8 +215,8 @@ define <vscale x 8 x i16> @vzext_nxv8i8_nxv8i16(<vscale x 8 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv8i8_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %evec
@@ -226,8 +226,8 @@ define <vscale x 8 x i32> @vsext_nxv8i8_nxv8i32(<vscale x 8 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv8i8_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsext.vf4 v8, v12
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 8 x i8> %va to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %evec
@@ -237,8 +237,8 @@ define <vscale x 8 x i32> @vzext_nxv8i8_nxv8i32(<vscale x 8 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv8i8_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf4 v8, v12
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 8 x i8> %va to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %evec
@@ -248,8 +248,8 @@ define <vscale x 8 x i64> @vsext_nxv8i8_nxv8i64(<vscale x 8 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv8i8_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vsext.vf8 v8, v16
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 8 x i8> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
@@ -259,8 +259,8 @@ define <vscale x 8 x i64> @vzext_nxv8i8_nxv8i64(<vscale x 8 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv8i8_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf8 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vzext.vf8 v8, v16
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 8 x i8> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
@@ -270,8 +270,8 @@ define <vscale x 16 x i16> @vsext_nxv16i8_nxv16i16(<vscale x 16 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv16i8_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 16 x i8> %va to <vscale x 16 x i16>
   ret <vscale x 16 x i16> %evec
@@ -281,8 +281,8 @@ define <vscale x 16 x i16> @vzext_nxv16i8_nxv16i16(<vscale x 16 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv16i8_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 16 x i8> %va to <vscale x 16 x i16>
   ret <vscale x 16 x i16> %evec
@@ -292,8 +292,8 @@ define <vscale x 16 x i32> @vsext_nxv16i8_nxv16i32(<vscale x 16 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv16i8_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vsext.vf4 v8, v16
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 16 x i8> %va to <vscale x 16 x i32>
   ret <vscale x 16 x i32> %evec
@@ -303,8 +303,8 @@ define <vscale x 16 x i32> @vzext_nxv16i8_nxv16i32(<vscale x 16 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv16i8_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vzext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vzext.vf4 v8, v16
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 16 x i8> %va to <vscale x 16 x i32>
   ret <vscale x 16 x i32> %evec
@@ -314,8 +314,8 @@ define <vscale x 32 x i16> @vsext_nxv32i8_nxv32i16(<vscale x 32 x i8> %va) {
 ; CHECK-LABEL: vsext_nxv32i8_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v16
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 32 x i8> %va to <vscale x 32 x i16>
   ret <vscale x 32 x i16> %evec
@@ -325,8 +325,8 @@ define <vscale x 32 x i16> @vzext_nxv32i8_nxv32i16(<vscale x 32 x i8> %va) {
 ; CHECK-LABEL: vzext_nxv32i8_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vzext.vf2 v8, v16
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 32 x i8> %va to <vscale x 32 x i16>
   ret <vscale x 32 x i16> %evec
@@ -402,8 +402,8 @@ define <vscale x 2 x i64> @vsext_nxv2i16_nxv2i64(<vscale x 2 x i16> %va) {
 ; CHECK-LABEL: vsext_nxv2i16_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 2 x i16> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -413,8 +413,8 @@ define <vscale x 2 x i64> @vzext_nxv2i16_nxv2i64(<vscale x 2 x i16> %va) {
 ; CHECK-LABEL: vzext_nxv2i16_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 2 x i16> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -424,8 +424,8 @@ define <vscale x 4 x i32> @vsext_nxv4i16_nxv4i32(<vscale x 4 x i16> %va) {
 ; CHECK-LABEL: vsext_nxv4i16_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 4 x i16> %va to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %evec
@@ -435,8 +435,8 @@ define <vscale x 4 x i32> @vzext_nxv4i16_nxv4i32(<vscale x 4 x i16> %va) {
 ; CHECK-LABEL: vzext_nxv4i16_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 4 x i16> %va to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %evec
@@ -446,8 +446,8 @@ define <vscale x 4 x i64> @vsext_nxv4i16_nxv4i64(<vscale x 4 x i16> %va) {
 ; CHECK-LABEL: vsext_nxv4i16_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsext.vf4 v8, v12
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 4 x i16> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -457,8 +457,8 @@ define <vscale x 4 x i64> @vzext_nxv4i16_nxv4i64(<vscale x 4 x i16> %va) {
 ; CHECK-LABEL: vzext_nxv4i16_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf4 v8, v12
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 4 x i16> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -468,8 +468,8 @@ define <vscale x 8 x i32> @vsext_nxv8i16_nxv8i32(<vscale x 8 x i16> %va) {
 ; CHECK-LABEL: vsext_nxv8i16_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 8 x i16> %va to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %evec
@@ -479,8 +479,8 @@ define <vscale x 8 x i32> @vzext_nxv8i16_nxv8i32(<vscale x 8 x i16> %va) {
 ; CHECK-LABEL: vzext_nxv8i16_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 8 x i16> %va to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %evec
@@ -490,8 +490,8 @@ define <vscale x 8 x i64> @vsext_nxv8i16_nxv8i64(<vscale x 8 x i16> %va) {
 ; CHECK-LABEL: vsext_nxv8i16_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vsext.vf4 v8, v16
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 8 x i16> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
@@ -501,8 +501,8 @@ define <vscale x 8 x i64> @vzext_nxv8i16_nxv8i64(<vscale x 8 x i16> %va) {
 ; CHECK-LABEL: vzext_nxv8i16_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vzext.vf4 v8, v16
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 8 x i16> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
@@ -512,8 +512,8 @@ define <vscale x 16 x i32> @vsext_nxv16i16_nxv16i32(<vscale x 16 x i16> %va) {
 ; CHECK-LABEL: vsext_nxv16i16_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v16
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 16 x i16> %va to <vscale x 16 x i32>
   ret <vscale x 16 x i32> %evec
@@ -523,8 +523,8 @@ define <vscale x 16 x i32> @vzext_nxv16i16_nxv16i32(<vscale x 16 x i16> %va) {
 ; CHECK-LABEL: vzext_nxv16i16_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vzext.vf2 v8, v16
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 16 x i16> %va to <vscale x 16 x i32>
   ret <vscale x 16 x i32> %evec
@@ -556,8 +556,8 @@ define <vscale x 2 x i64> @vsext_nxv2i32_nxv2i64(<vscale x 2 x i32> %va) {
 ; CHECK-LABEL: vsext_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -567,8 +567,8 @@ define <vscale x 2 x i64> @vzext_nxv2i32_nxv2i64(<vscale x 2 x i32> %va) {
 ; CHECK-LABEL: vzext_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -578,8 +578,8 @@ define <vscale x 4 x i64> @vsext_nxv4i32_nxv4i64(<vscale x 4 x i32> %va) {
 ; CHECK-LABEL: vsext_nxv4i32_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -589,8 +589,8 @@ define <vscale x 4 x i64> @vzext_nxv4i32_nxv4i64(<vscale x 4 x i32> %va) {
 ; CHECK-LABEL: vzext_nxv4i32_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -600,8 +600,8 @@ define <vscale x 8 x i64> @vsext_nxv8i32_nxv8i64(<vscale x 8 x i32> %va) {
 ; CHECK-LABEL: vsext_nxv8i32_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v16
 ; CHECK-NEXT:    ret
   %evec = sext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
@@ -611,8 +611,8 @@ define <vscale x 8 x i64> @vzext_nxv8i32_nxv8i64(<vscale x 8 x i32> %va) {
 ; CHECK-LABEL: vzext_nxv8i32_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vzext.vf2 v8, v16
 ; CHECK-NEXT:    ret
   %evec = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
index 0faebe87f2657..4078b1d2fefdc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
@@ -103,11 +103,11 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v10, v10, v12
+; CHECK-NEXT:    vfadd.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -138,11 +138,11 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfadd.vv v12, v12, v16
+; CHECK-NEXT:    vfadd.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -173,11 +173,11 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v24, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v16, v24
+; CHECK-NEXT:    vfadd.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -397,11 +397,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -445,11 +445,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -493,11 +493,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 8354a55bd1867..3fe15c123ac04 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -227,11 +227,11 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v10, v10, v12, v0.t
+; CHECK-NEXT:    vfadd.vv v10, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT:    ret
@@ -246,11 +246,11 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v10, v10, v12
+; CHECK-NEXT:    vfadd.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -297,11 +297,11 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfadd.vv v12, v12, v16, v0.t
+; CHECK-NEXT:    vfadd.vv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT:    ret
@@ -316,11 +316,11 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfadd.vv v12, v12, v16
+; CHECK-NEXT:    vfadd.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -367,11 +367,11 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfadd.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT:    ret
@@ -386,11 +386,11 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v16, v24
+; CHECK-NEXT:    vfadd.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -925,11 +925,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -950,11 +950,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -1019,11 +1019,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -1044,11 +1044,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1113,11 +1113,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -1138,11 +1138,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
index beb56a2645a1c..83f588ce5027d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
@@ -1904,11 +1904,11 @@ define <vscale x 8 x double> @vfcopysign_exttrunc_vv_nxv8f64_nxv8f16(<vscale x 8
 ; CHECK-LABEL: vfcopysign_exttrunc_vv_nxv8f64_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v20, v16
+; CHECK-NEXT:    vfwcvt.f.f.v v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v24, v20
+; CHECK-NEXT:    vfwcvt.f.f.v v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnj.vv v8, v8, v24
+; CHECK-NEXT:    vfsgnj.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %e = fpext <vscale x 8 x half> %vs to <vscale x 8 x double>
   %r = call <vscale x 8 x double> @llvm.copysign.nxv8f64(<vscale x 8 x double> %vm, <vscale x 8 x double> %e)
@@ -1933,11 +1933,11 @@ define <vscale x 8 x double> @vfcopynsign_exttrunc_vv_nxv8f64_nxv8f16(<vscale x
 ; CHECK-LABEL: vfcopynsign_exttrunc_vv_nxv8f64_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v20, v16
+; CHECK-NEXT:    vfwcvt.f.f.v v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v24, v20
+; CHECK-NEXT:    vfwcvt.f.f.v v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnjn.vv v8, v8, v24
+; CHECK-NEXT:    vfsgnjn.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %n = fneg <vscale x 8 x half> %vs
   %eneg = fpext <vscale x 8 x half> %n to <vscale x 8 x double>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
index 217a02d08dead..3ae5a47fe19f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
@@ -103,11 +103,11 @@ define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v10, v10, v12
+; CHECK-NEXT:    vfdiv.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -138,11 +138,11 @@ define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfdiv.vv v12, v12, v16
+; CHECK-NEXT:    vfdiv.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -157,11 +157,11 @@ define <vscale x 8 x bfloat> @vfdiv_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfdiv.vv v12, v16, v12
+; CHECK-NEXT:    vfdiv.vv v12, v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -192,11 +192,11 @@ define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v24, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24
+; CHECK-NEXT:    vfdiv.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -426,11 +426,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16(<vscale x 4 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -474,11 +474,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16(<vscale x 8 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -499,11 +499,11 @@ define <vscale x 8 x half> @vfdiv_fv_nxv8f16(<vscale x 8 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v12, v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -547,11 +547,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index f3e483f3686a0..80c0ee2e873c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -189,11 +189,11 @@ define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v10, v10, v12, v0.t
+; CHECK-NEXT:    vfdiv.vv v10, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT:    ret
@@ -208,11 +208,11 @@ define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfdiv.vv v10, v10, v12
+; CHECK-NEXT:    vfdiv.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -259,11 +259,11 @@ define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfdiv.vv v12, v12, v16, v0.t
+; CHECK-NEXT:    vfdiv.vv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT:    ret
@@ -278,11 +278,11 @@ define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfdiv.vv v12, v12, v16
+; CHECK-NEXT:    vfdiv.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -329,11 +329,11 @@ define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfdiv.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT:    ret
@@ -348,11 +348,11 @@ define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24
+; CHECK-NEXT:    vfdiv.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -837,11 +837,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -862,11 +862,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -931,11 +931,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -956,11 +956,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1025,11 +1025,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -1050,11 +1050,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index c8a8dc98cbbe7..a2869999094a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -300,11 +300,11 @@ define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_commute(<vscale x 4 x bfloat> %va
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v14, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v14, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v14, v10, v0.t
+; CHECK-NEXT:    vfmadd.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT:    ret
@@ -414,11 +414,11 @@ define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_commute(<vscale x 8 x bfloat> %va
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10, v0.t
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v20, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v20, v12, v0.t
+; CHECK-NEXT:    vfmadd.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT:    ret
@@ -1185,13 +1185,13 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmset.m v8
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
+; CHECK-NEXT:    vslidedown.vx v0, v8, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
@@ -1201,15 +1201,15 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20, v0.t
 ; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    mv a3, a2
@@ -1660,11 +1660,11 @@ define <vscale x 4 x half> @vfma_vf_nxv4f16_commute(<vscale x 4 x half> %va, hal
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -1811,11 +1811,11 @@ define <vscale x 8 x half> @vfma_vf_nxv8f16_commute(<vscale x 8 x half> %va, hal
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -2670,13 +2670,13 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
@@ -2686,15 +2686,15 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -3496,12 +3496,12 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    add a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    mv a3, a2
@@ -3511,7 +3511,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a2, a0
@@ -3529,16 +3529,16 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -5391,12 +5391,12 @@ define <vscale x 4 x half> @vfmsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -5441,16 +5441,16 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5470,16 +5470,16 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16_commute(<vscale x 4 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5499,16 +5499,16 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vxor.vx v12, v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5528,16 +5528,16 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16_unmasked_commute(<vscale x 4 x hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vxor.vx v12, v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5559,14 +5559,14 @@ define <vscale x 4 x half> @vfnmadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %negb = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
   %negc = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %c, <vscale x 4 x i1> %m, i32 %evl)
@@ -5586,12 +5586,12 @@ define <vscale x 4 x half> @vfnmadd_vv_nxv4f16_commuted(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -5613,12 +5613,12 @@ define <vscale x 4 x half> @vfnmadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -5640,12 +5640,12 @@ define <vscale x 4 x half> @vfnmadd_vv_nxv4f16_unmasked_commuted(<vscale x 4 x h
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -5666,17 +5666,17 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v10, v9, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5697,17 +5697,17 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_commute(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v10, v9, a0, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5728,17 +5728,17 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v10, v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5759,17 +5759,17 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_unmasked_commute(<vscale x 4 x ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v10, v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5792,15 +5792,15 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_neg_splat(<vscale x 4 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5823,15 +5823,15 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_neg_splat_commute(<vscale x 4 x h
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5855,14 +5855,14 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_neg_splat_unmasked(<vscale x 4 x
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5886,14 +5886,14 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5916,14 +5916,14 @@ define <vscale x 4 x half> @vfnmsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %negb = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
   %negc = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %c, <vscale x 4 x i1> %m, i32 %evl)
@@ -5943,12 +5943,12 @@ define <vscale x 4 x half> @vfnmsub_vv_nxv4f16_commuted(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -5970,12 +5970,12 @@ define <vscale x 4 x half> @vfnmsub_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -5997,12 +5997,12 @@ define <vscale x 4 x half> @vfnmsub_vv_nxv4f16_unmasked_commuted(<vscale x 4 x h
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -6023,16 +6023,16 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6052,16 +6052,16 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_commute(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v14, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6081,16 +6081,16 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6110,16 +6110,16 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_unmasked_commute(<vscale x 4 x ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6141,14 +6141,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat(<vscale x 4 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6170,12 +6170,12 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat_commute(<vscale x 4 x h
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -6197,14 +6197,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat_unmasked(<vscale x 4 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v9, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -6226,14 +6226,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v9, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -6258,12 +6258,12 @@ define <vscale x 8 x half> @vfmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -6308,16 +6308,16 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6337,16 +6337,16 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16_commute(<vscale x 8 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6366,16 +6366,16 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v16, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6395,16 +6395,16 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16_unmasked_commute(<vscale x 8 x hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v16, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6426,14 +6426,14 @@ define <vscale x 8 x half> @vfnmadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %negb = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
   %negc = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %c, <vscale x 8 x i1> %m, i32 %evl)
@@ -6453,12 +6453,12 @@ define <vscale x 8 x half> @vfnmadd_vv_nxv8f16_commuted(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -6480,12 +6480,12 @@ define <vscale x 8 x half> @vfnmadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -6507,12 +6507,12 @@ define <vscale x 8 x half> @vfnmadd_vv_nxv8f16_unmasked_commuted(<vscale x 8 x h
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -6533,17 +6533,17 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6564,17 +6564,17 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_commute(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6595,17 +6595,17 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6626,17 +6626,17 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_unmasked_commute(<vscale x 8 x ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6659,15 +6659,15 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_neg_splat(<vscale x 8 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6690,15 +6690,15 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_neg_splat_commute(<vscale x 8 x h
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6722,14 +6722,14 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_neg_splat_unmasked(<vscale x 8 x
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6753,14 +6753,14 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6783,14 +6783,14 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %negb = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
   %negc = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %c, <vscale x 8 x i1> %m, i32 %evl)
@@ -6810,12 +6810,12 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16_commuted(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -6837,12 +6837,12 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -6864,12 +6864,12 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16_unmasked_commuted(<vscale x 8 x h
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -6890,16 +6890,16 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6919,16 +6919,16 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_commute(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6948,16 +6948,16 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6977,16 +6977,16 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_unmasked_commute(<vscale x 8 x ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -7008,14 +7008,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat(<vscale x 8 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -7037,12 +7037,12 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat_commute(<vscale x 8 x h
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -7064,14 +7064,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat_unmasked(<vscale x 8 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vxor.vx v10, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v10, v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -7093,14 +7093,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vxor.vx v10, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v10, v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -7127,8 +7127,8 @@ define <vscale x 16 x half> @vfmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    vmv4r.v v20, v8
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -7234,34 +7234,18 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v24, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7279,34 +7263,18 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16_unmasked_commute(<vscale x 16 x
 ;
 ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v24, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7328,10 +7296,10 @@ define <vscale x 16 x half> @vfnmadd_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
@@ -7357,8 +7325,8 @@ define <vscale x 16 x half> @vfnmadd_vv_nxv16f16_commuted(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -7384,12 +7352,12 @@ define <vscale x 16 x half> @vfnmadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -7411,12 +7379,12 @@ define <vscale x 16 x half> @vfnmadd_vv_nxv16f16_unmasked_commuted(<vscale x 16
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -7439,15 +7407,15 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16(<vscale x 16 x half> %va, half
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v20, v8, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7470,10 +7438,10 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_commute(<vscale x 16 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v28, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a0, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
@@ -7497,35 +7465,19 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7541,38 +7493,22 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_unmasked_commute(<vscale x 16 x
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v12
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_unmasked_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+;
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_unmasked_commute:
+; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7591,35 +7527,20 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat(<vscale x 16 x half>
 ;
 ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_neg_splat:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    vmv4r.v v4, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v4, v16, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7638,35 +7559,20 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat_commute(<vscale x 16
 ;
 ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_neg_splat_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    vmv4r.v v4, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v4, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7690,14 +7596,14 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat_unmasked(<vscale x 16
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v4, v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7721,14 +7627,14 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v4, v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7751,10 +7657,10 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
@@ -7780,8 +7686,8 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16_commuted(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -7807,12 +7713,12 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -7834,12 +7740,12 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16_unmasked_commuted(<vscale x 16
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -7917,34 +7823,18 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7962,34 +7852,18 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_unmasked_commute(<vscale x 16 x
 ;
 ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8010,10 +7884,10 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat(<vscale x 16 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -8038,14 +7912,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat_commute(<vscale x 16
 ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16_neg_splat_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv4r.v v20, v12
+; ZVFHMIN-NEXT:    vmv4r.v v16, v12
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v24, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v20, v8, a0, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
@@ -8070,14 +7944,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat_unmasked(<vscale x 16
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v12, v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -8099,14 +7973,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v12, v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -8275,18 +8149,18 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv8r.v v24, v8
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v8, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
@@ -9294,35 +9168,35 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10179,17 +10053,17 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v0, a1
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v16, v0, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
@@ -10197,17 +10071,17 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10302,10 +10176,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
@@ -10316,17 +10190,17 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10810,35 +10684,35 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -11684,45 +11558,49 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v24, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -11747,16 +11625,16 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
index f6769601f488e..a03629d17e612 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
@@ -161,16 +161,16 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v14, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vxor.vx v12, v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 4 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -217,16 +217,16 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vmv.v.x v20, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -246,32 +246,16 @@ define <vscale x 16 x half> @vfmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 16 x half> %vb
   %vd = call <vscale x 16 x half> @llvm.experimental.constrained.fma.nxv16f16(<vscale x 16 x half> %vc, <vscale x 16 x half> %va, <vscale x 16 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -287,34 +271,18 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v4, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v24, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
index 9e6b1f6fefd08..e083e2e1fd072 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
@@ -103,11 +103,11 @@ define <vscale x 4 x bfloat> @vfmul_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmul.vv v10, v10, v12
+; CHECK-NEXT:    vfmul.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -138,11 +138,11 @@ define <vscale x 8 x bfloat> @vfmul_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmul.vv v12, v12, v16
+; CHECK-NEXT:    vfmul.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -173,11 +173,11 @@ define <vscale x 16 x bfloat> @vfmul_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v24, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v16, v16, v24
+; CHECK-NEXT:    vfmul.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -397,11 +397,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16(<vscale x 4 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -445,11 +445,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16(<vscale x 8 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -493,11 +493,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index 67d5f1b7c82ef..fc9c798a48bdf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -251,11 +251,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -276,11 +276,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -345,11 +345,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -370,11 +370,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -439,11 +439,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -464,11 +464,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
index b8ec285b5c34e..e4b467c12791d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
@@ -145,12 +145,12 @@ define <vscale x 4 x half> @vfnmsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v12, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -174,14 +174,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 4 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -204,15 +204,15 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v14
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 8 x half> %vb
   %neg2 = fneg <vscale x 8 x half> %va
@@ -234,14 +234,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -264,15 +264,15 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
+; ZVFHMIN-NEXT:    vxor.vx v4, v16, a0
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 16 x half> %vc
   %neg2 = fneg <vscale x 16 x half> %vb
@@ -294,14 +294,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v4, v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
index 742083f6f684b..ff612bdcd2725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
@@ -161,14 +161,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vxor.vx v9, v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -193,13 +193,13 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v14, v10, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 8 x half> %vb
   %vd = call <vscale x 8 x half> @llvm.experimental.constrained.fma.nxv8f16(<vscale x 8 x half> %neg, <vscale x 8 x half> %vc, <vscale x 8 x half> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -217,14 +217,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -273,14 +273,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vxor.vx v12, v24, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll
index 3999b97d6e256..9c77a6818bcb2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll
@@ -59,8 +59,8 @@ define <vscale x 4 x float> @vfpext_nxv4f16_nxv4f32(<vscale x 4 x half> %va) str
 ; CHECK-LABEL: vfpext_nxv4f16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x float> @llvm.experimental.constrained.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x float> %evec
@@ -84,8 +84,8 @@ define <vscale x 8 x float> @vfpext_nxv8f16_nxv8f32(<vscale x 8 x half> %va) str
 ; CHECK-LABEL: vfpext_nxv8f16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x float> @llvm.experimental.constrained.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x float> %evec
@@ -121,8 +121,8 @@ define <vscale x 2 x double> @vfpext_nxv2f32_nxv2f64(<vscale x 2 x float> %va) s
 ; CHECK-LABEL: vfpext_nxv2f32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 2 x double> @llvm.experimental.constrained.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 2 x double> %evec
@@ -133,8 +133,8 @@ define <vscale x 4 x double> @vfpext_nxv4f32_nxv4f64(<vscale x 4 x float> %va) s
 ; CHECK-LABEL: vfpext_nxv4f32_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x double> @llvm.experimental.constrained.fpext.nxv4f64.nxv4f32(<vscale x 4 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x double> %evec
@@ -145,8 +145,8 @@ define <vscale x 8 x double> @vfpext_nxv8f32_nxv8f64(<vscale x 8 x float> %va) s
 ; CHECK-LABEL: vfpext_nxv8f32_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x double> @llvm.experimental.constrained.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x double> %evec
@@ -207,8 +207,8 @@ define <vscale x 4 x float> @vfpext_nxv4bf16_nxv4f32(<vscale x 4 x bfloat> %va)
 ; CHECK-LABEL: vfpext_nxv4bf16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x float> @llvm.experimental.constrained.fpext.nxv4f32.nxv4bf16(<vscale x 4 x bfloat> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x float> %evec
@@ -232,8 +232,8 @@ define <vscale x 8 x float> @vfpext_nxv8bf16_nxv8f32(<vscale x 8 x bfloat> %va)
 ; CHECK-LABEL: vfpext_nxv8bf16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x float> @llvm.experimental.constrained.fpext.nxv8f32.nxv8bf16(<vscale x 8 x bfloat> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x float> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll
index 3b2de0185f90c..557cc3fea1592 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll
@@ -63,8 +63,8 @@ define <vscale x 4 x float> @vfpext_nxv4f16_nxv4f32(<vscale x 4 x half> %va) {
 ; CHECK-LABEL: vfpext_nxv4f16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 4 x half> %va to <vscale x 4 x float>
   ret <vscale x 4 x float> %evec
@@ -88,8 +88,8 @@ define <vscale x 8 x float> @vfpext_nxv8f16_nxv8f32(<vscale x 8 x half> %va) {
 ; CHECK-LABEL: vfpext_nxv8f16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 8 x half> %va to <vscale x 8 x float>
   ret <vscale x 8 x float> %evec
@@ -113,8 +113,8 @@ define <vscale x 16 x float> @vfpext_nxv16f16_nxv16f32(<vscale x 16 x half> %va)
 ; CHECK-LABEL: vfpext_nxv16f16_nxv16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 16 x half> %va to <vscale x 16 x float>
   ret <vscale x 16 x float> %evec
@@ -137,8 +137,8 @@ define <vscale x 2 x double> @vfpext_nxv2f32_nxv2f64(<vscale x 2 x float> %va) {
 ; CHECK-LABEL: vfpext_nxv2f32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 2 x float> %va to <vscale x 2 x double>
   ret <vscale x 2 x double> %evec
@@ -149,8 +149,8 @@ define <vscale x 4 x double> @vfpext_nxv4f32_nxv4f64(<vscale x 4 x float> %va) {
 ; CHECK-LABEL: vfpext_nxv4f32_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 4 x float> %va to <vscale x 4 x double>
   ret <vscale x 4 x double> %evec
@@ -161,8 +161,8 @@ define <vscale x 8 x double> @vfpext_nxv8f32_nxv8f64(<vscale x 8 x float> %va) {
 ; CHECK-LABEL: vfpext_nxv8f32_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 8 x float> %va to <vscale x 8 x double>
   ret <vscale x 8 x double> %evec
@@ -223,8 +223,8 @@ define <vscale x 4 x float> @vfpext_nxv4bf16_nxv4f32(<vscale x 4 x bfloat> %va)
 ; CHECK-LABEL: vfpext_nxv4bf16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 4 x bfloat> %va to <vscale x 4 x float>
   ret <vscale x 4 x float> %evec
@@ -248,8 +248,8 @@ define <vscale x 8 x float> @vfpext_nxv8bf16_nxv8f32(<vscale x 8 x bfloat> %va)
 ; CHECK-LABEL: vfpext_nxv8bf16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 8 x bfloat> %va to <vscale x 8 x float>
   ret <vscale x 8 x float> %evec
@@ -273,8 +273,8 @@ define <vscale x 16 x float> @vfpext_nxv16bf16_nxv16f32(<vscale x 16 x bfloat> %
 ; CHECK-LABEL: vfpext_nxv16bf16_nxv16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = fpext <vscale x 16 x bfloat> %va to <vscale x 16 x float>
   ret <vscale x 16 x float> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 6debc47460683..137b616d86fcc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -60,8 +60,8 @@ define <vscale x 2 x double> @vfpext_nxv2f32_nxv2f64(<vscale x 2 x float> %a, <v
 ; CHECK-LABEL: vfpext_nxv2f32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x double> %v
@@ -71,8 +71,8 @@ define <vscale x 2 x double> @vfpext_nxv2f32_nxv2f64_unmasked(<vscale x 2 x floa
 ; CHECK-LABEL: vfpext_nxv2f32_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x double> %v
@@ -84,8 +84,8 @@ define <vscale x 7 x double> @vfpext_nxv7f32_nxv7f64(<vscale x 7 x float> %a, <v
 ; CHECK-LABEL: vfpext_nxv7f32_nxv7f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.fpext.nxv7f64.nxv7f32(<vscale x 7 x float> %a, <vscale x 7 x i1> %m, i32 %vl)
   ret <vscale x 7 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll
index 6ebdcec4a403c..9e7f4ede29f54 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll
@@ -349,8 +349,8 @@ define <vscale x 4 x i32> @vfptosi_nxv4f16_nxv4i32(<vscale x 4 x half> %va) stri
 ; CHECK-LABEL: vfptosi_nxv4f16_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x i32> @llvm.experimental.constrained.fptosi.nxv4i32.nxv4f16(<vscale x 4 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x i32> %evec
@@ -361,8 +361,8 @@ define <vscale x 4 x i32> @vfptoui_nxv4f16_nxv4i32(<vscale x 4 x half> %va) stri
 ; CHECK-LABEL: vfptoui_nxv4f16_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x i32> @llvm.experimental.constrained.fptoui.nxv4i32.nxv4f16(<vscale x 4 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x i32> %evec
@@ -471,8 +471,8 @@ define <vscale x 8 x i32> @vfptosi_nxv8f16_nxv8i32(<vscale x 8 x half> %va) stri
 ; CHECK-LABEL: vfptosi_nxv8f16_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x i32> @llvm.experimental.constrained.fptosi.nxv8i32.nxv8f16(<vscale x 8 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x i32> %evec
@@ -483,8 +483,8 @@ define <vscale x 8 x i32> @vfptoui_nxv8f16_nxv8i32(<vscale x 8 x half> %va) stri
 ; CHECK-LABEL: vfptoui_nxv8f16_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x i32> @llvm.experimental.constrained.fptoui.nxv8i32.nxv8f16(<vscale x 8 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x i32> %evec
@@ -593,8 +593,8 @@ define <vscale x 16 x i32> @vfptosi_nxv16f16_nxv16i32(<vscale x 16 x half> %va)
 ; CHECK-LABEL: vfptosi_nxv16f16_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 16 x i32> @llvm.experimental.constrained.fptosi.nxv16i32.nxv16f16(<vscale x 16 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 16 x i32> %evec
@@ -605,8 +605,8 @@ define <vscale x 16 x i32> @vfptoui_nxv16f16_nxv16i32(<vscale x 16 x half> %va)
 ; CHECK-LABEL: vfptoui_nxv16f16_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 16 x i32> @llvm.experimental.constrained.fptoui.nxv16i32.nxv16f16(<vscale x 16 x half> %va, metadata !"fpexcept.strict")
   ret <vscale x 16 x i32> %evec
@@ -909,8 +909,8 @@ define <vscale x 2 x i64> @vfptosi_nxv2f32_nxv2i64(<vscale x 2 x float> %va) str
 ; CHECK-LABEL: vfptosi_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 2 x i64> @llvm.experimental.constrained.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 2 x i64> %evec
@@ -921,8 +921,8 @@ define <vscale x 2 x i64> @vfptoui_nxv2f32_nxv2i64(<vscale x 2 x float> %va) str
 ; CHECK-LABEL: vfptoui_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 2 x i64> @llvm.experimental.constrained.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 2 x i64> %evec
@@ -1031,8 +1031,8 @@ define <vscale x 4 x i64> @vfptosi_nxv4f32_nxv4i64(<vscale x 4 x float> %va) str
 ; CHECK-LABEL: vfptosi_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x i64> @llvm.experimental.constrained.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x i64> %evec
@@ -1043,8 +1043,8 @@ define <vscale x 4 x i64> @vfptoui_nxv4f32_nxv4i64(<vscale x 4 x float> %va) str
 ; CHECK-LABEL: vfptoui_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x i64> @llvm.experimental.constrained.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 4 x i64> %evec
@@ -1153,8 +1153,8 @@ define <vscale x 8 x i64> @vfptosi_nxv8f32_nxv8i64(<vscale x 8 x float> %va) str
 ; CHECK-LABEL: vfptosi_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x i64> @llvm.experimental.constrained.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x i64> %evec
@@ -1165,8 +1165,8 @@ define <vscale x 8 x i64> @vfptoui_nxv8f32_nxv8i64(<vscale x 8 x float> %va) str
 ; CHECK-LABEL: vfptoui_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x i64> @llvm.experimental.constrained.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x i64> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
index d707b4254d3e1..df7af4d8b1667 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
@@ -1246,8 +1246,8 @@ define <vscale x 4 x i32> @vfptosi_nxv4f16_nxv4i32(<vscale x 4 x half> %va) {
 ; ZVFH-LABEL: vfptosi_nxv4f16_nxv4i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; ZVFH-NEXT:    vmv2r.v v8, v10
+; ZVFH-NEXT:    vmv1r.v v10, v8
+; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv4f16_nxv4i32:
@@ -1265,8 +1265,8 @@ define <vscale x 4 x i32> @vfptoui_nxv4f16_nxv4i32(<vscale x 4 x half> %va) {
 ; ZVFH-LABEL: vfptoui_nxv4f16_nxv4i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; ZVFH-NEXT:    vmv2r.v v8, v10
+; ZVFH-NEXT:    vmv1r.v v10, v8
+; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv4f16_nxv4i32:
@@ -1424,8 +1424,8 @@ define <vscale x 8 x i32> @vfptosi_nxv8f16_nxv8i32(<vscale x 8 x half> %va) {
 ; ZVFH-LABEL: vfptosi_nxv8f16_nxv8i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; ZVFH-NEXT:    vmv4r.v v8, v12
+; ZVFH-NEXT:    vmv2r.v v12, v8
+; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv8f16_nxv8i32:
@@ -1443,8 +1443,8 @@ define <vscale x 8 x i32> @vfptoui_nxv8f16_nxv8i32(<vscale x 8 x half> %va) {
 ; ZVFH-LABEL: vfptoui_nxv8f16_nxv8i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; ZVFH-NEXT:    vmv4r.v v8, v12
+; ZVFH-NEXT:    vmv2r.v v12, v8
+; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv8f16_nxv8i32:
@@ -1602,8 +1602,8 @@ define <vscale x 16 x i32> @vfptosi_nxv16f16_nxv16i32(<vscale x 16 x half> %va)
 ; ZVFH-LABEL: vfptosi_nxv16f16_nxv16i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v16, v8
-; ZVFH-NEXT:    vmv8r.v v8, v16
+; ZVFH-NEXT:    vmv4r.v v16, v8
+; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v8, v16
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv16f16_nxv16i32:
@@ -1621,8 +1621,8 @@ define <vscale x 16 x i32> @vfptoui_nxv16f16_nxv16i32(<vscale x 16 x half> %va)
 ; ZVFH-LABEL: vfptoui_nxv16f16_nxv16i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v16, v8
-; ZVFH-NEXT:    vmv8r.v v8, v16
+; ZVFH-NEXT:    vmv4r.v v16, v8
+; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v8, v16
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv16f16_nxv16i32:
@@ -1990,8 +1990,8 @@ define <vscale x 2 x i64> @vfptosi_nxv2f32_nxv2i64(<vscale x 2 x float> %va) {
 ; CHECK-LABEL: vfptosi_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = fptosi <vscale x 2 x float> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -2001,8 +2001,8 @@ define <vscale x 2 x i64> @vfptoui_nxv2f32_nxv2i64(<vscale x 2 x float> %va) {
 ; CHECK-LABEL: vfptoui_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = fptoui <vscale x 2 x float> %va to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %evec
@@ -2102,8 +2102,8 @@ define <vscale x 4 x i64> @vfptosi_nxv4f32_nxv4i64(<vscale x 4 x float> %va) {
 ; CHECK-LABEL: vfptosi_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = fptosi <vscale x 4 x float> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -2113,8 +2113,8 @@ define <vscale x 4 x i64> @vfptoui_nxv4f32_nxv4i64(<vscale x 4 x float> %va) {
 ; CHECK-LABEL: vfptoui_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = fptoui <vscale x 4 x float> %va to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %evec
@@ -2214,8 +2214,8 @@ define <vscale x 8 x i64> @vfptosi_nxv8f32_nxv8i64(<vscale x 8 x float> %va) {
 ; CHECK-LABEL: vfptosi_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = fptosi <vscale x 8 x float> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
@@ -2225,8 +2225,8 @@ define <vscale x 8 x i64> @vfptoui_nxv8f32_nxv8i64(<vscale x 8 x float> %va) {
 ; CHECK-LABEL: vfptoui_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = fptoui <vscale x 8 x float> %va to <vscale x 8 x i64>
   ret <vscale x 8 x i64> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 402a28396a303..f94f709626443 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -365,8 +365,8 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f32(<vscale x 2 x float> %va, <vs
 ; CHECK-LABEL: vfptosi_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -376,8 +376,8 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f32_unmasked(<vscale x 2 x float>
 ; CHECK-LABEL: vfptosi_nxv2i64_nxv2f32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -491,8 +491,7 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    slli a1, a1, 1
@@ -502,15 +501,15 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v20, v24, v0.t
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v28, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB34_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 51f7cec96e85c..7aae383049deb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -365,8 +365,8 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f32(<vscale x 2 x float> %va, <vs
 ; CHECK-LABEL: vfptoui_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -376,8 +376,8 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f32_unmasked(<vscale x 2 x float>
 ; CHECK-LABEL: vfptoui_nxv2i64_nxv2f32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -491,8 +491,7 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-LABEL: vfptoui_nxv32i16_nxv32f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    slli a1, a1, 1
@@ -502,15 +501,15 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v20, v24, v0.t
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v28, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB34_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 3adb84f8da20f..5c5abed707257 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -97,8 +97,7 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-LABEL: vfptrunc_nxv16f32_nxv16f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
@@ -107,15 +106,15 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v20, v24, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v28, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB7_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vfncvt.f.f.w v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f64.nxv16f32(<vscale x 16 x double> %a, <vscale x 16 x i1> %m, i32 %vl)
   ret <vscale x 16 x float> %v
@@ -142,7 +141,7 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vl8re64.v v8, (a0)
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    srli a5, a1, 3
 ; CHECK-NEXT:    slli a4, a1, 3
 ; CHECK-NEXT:    slli a3, a1, 1
@@ -156,20 +155,20 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a4, a7, a4
 ; CHECK-NEXT:    srli a7, a1, 2
-; CHECK-NEXT:    vl8re64.v v24, (a6)
-; CHECK-NEXT:    vslidedown.vx v16, v0, a7
+; CHECK-NEXT:    vl8re64.v v8, (a6)
+; CHECK-NEXT:    vslidedown.vx v24, v0, a7
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a5
-; CHECK-NEXT:    vslidedown.vx v0, v16, a5
+; CHECK-NEXT:    vslidedown.vx v0, v24, a5
 ; CHECK-NEXT:    bltu a0, a1, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB8_2:
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v20, v24, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfncvt.f.f.w v28, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v16, v8, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v24, v16, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB8_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a3
@@ -180,9 +179,9 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v12, v24, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v12, v16, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB8_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
@@ -192,9 +191,10 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v8, v24, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
index fbb14ab43e075..e6493ea229a50 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
@@ -103,11 +103,11 @@ define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v10, v10, v12
+; CHECK-NEXT:    vfsub.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -138,11 +138,11 @@ define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfsub.vv v12, v12, v16
+; CHECK-NEXT:    vfsub.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -157,11 +157,11 @@ define <vscale x 8 x bfloat> @vfsub_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfsub.vv v12, v16, v12
+; CHECK-NEXT:    vfsub.vv v12, v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -192,11 +192,11 @@ define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v24, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24
+; CHECK-NEXT:    vfsub.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -416,11 +416,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -464,11 +464,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -489,11 +489,11 @@ define <vscale x 8 x half> @vfsub_fv_nxv8f16(<vscale x 8 x half> %va, half %b) s
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
+; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12
+; ZVFHMIN-NEXT:    vfsub.vv v12, v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -537,11 +537,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 2dd4fea82f9d6..70317e9d55cc7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -189,11 +189,11 @@ define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v10, v10, v12, v0.t
+; CHECK-NEXT:    vfsub.vv v10, v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
 ; CHECK-NEXT:    ret
@@ -208,11 +208,11 @@ define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfsub.vv v10, v10, v12
+; CHECK-NEXT:    vfsub.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
 ; CHECK-NEXT:    ret
@@ -259,11 +259,11 @@ define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfsub.vv v12, v12, v16, v0.t
+; CHECK-NEXT:    vfsub.vv v12, v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12, v0.t
 ; CHECK-NEXT:    ret
@@ -278,11 +278,11 @@ define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfsub.vv v12, v12, v16
+; CHECK-NEXT:    vfsub.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -329,11 +329,11 @@ define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfsub.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
 ; CHECK-NEXT:    ret
@@ -348,11 +348,11 @@ define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24
+; CHECK-NEXT:    vfsub.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
@@ -837,11 +837,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -862,11 +862,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12
+; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -931,11 +931,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -956,11 +956,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16
+; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1025,11 +1025,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -1050,11 +1050,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll
index f7d287a088cc3..8d41514161c61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll
@@ -88,8 +88,9 @@ define <vscale x 2 x double> @vfwadd_vv_nxv2f64(<vscale x 2 x float> %va, <vscal
 ; CHECK-LABEL: vfwadd_vv_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vfwadd.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 2 x float> %va to <vscale x 2 x double>
   %vd = fpext <vscale x 2 x float> %vb to <vscale x 2 x double>
@@ -101,8 +102,8 @@ define <vscale x 2 x double> @vfwadd_vf_nxv2f64(<vscale x 2 x float> %va, float
 ; CHECK-LABEL: vfwadd_vf_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwadd.vf v8, v10, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 2 x float> %head, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
@@ -116,8 +117,8 @@ define <vscale x 2 x double> @vfwadd_vf_nxv2f64_2(<vscale x 2 x float> %va, floa
 ; CHECK-LABEL: vfwadd_vf_nxv2f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwadd.vf v8, v10, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 2 x double> poison, double %fpext, i32 0
@@ -168,8 +169,9 @@ define <vscale x 4 x double> @vfwadd_vv_nxv4f64(<vscale x 4 x float> %va, <vscal
 ; CHECK-LABEL: vfwadd_vv_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vfwadd.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 4 x float> %va to <vscale x 4 x double>
   %vd = fpext <vscale x 4 x float> %vb to <vscale x 4 x double>
@@ -181,8 +183,8 @@ define <vscale x 4 x double> @vfwadd_vf_nxv4f64(<vscale x 4 x float> %va, float
 ; CHECK-LABEL: vfwadd_vf_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vf v12, v8, fa0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwadd.vf v8, v12, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -196,8 +198,8 @@ define <vscale x 4 x double> @vfwadd_vf_nxv4f64_2(<vscale x 4 x float> %va, floa
 ; CHECK-LABEL: vfwadd_vf_nxv4f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vf v12, v8, fa0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwadd.vf v8, v12, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 4 x double> poison, double %fpext, i32 0
@@ -248,8 +250,9 @@ define <vscale x 8 x double> @vfwadd_vv_nxv8f64(<vscale x 8 x float> %va, <vscal
 ; CHECK-LABEL: vfwadd_vv_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vfwadd.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 8 x float> %va to <vscale x 8 x double>
   %vd = fpext <vscale x 8 x float> %vb to <vscale x 8 x double>
@@ -261,8 +264,8 @@ define <vscale x 8 x double> @vfwadd_vf_nxv8f64(<vscale x 8 x float> %va, float
 ; CHECK-LABEL: vfwadd_vf_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vf v16, v8, fa0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwadd.vf v8, v16, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -276,8 +279,8 @@ define <vscale x 8 x double> @vfwadd_vf_nxv8f64_2(<vscale x 8 x float> %va, floa
 ; CHECK-LABEL: vfwadd_vf_nxv8f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vf v16, v8, fa0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwadd.vf v8, v16, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 8 x double> poison, double %fpext, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll
index d980803cb3897..df8ece342d9ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll
@@ -115,11 +115,12 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwadd_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vv v8, v11, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4f16.nxv4f16(
@@ -166,11 +167,12 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwadd_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vv v8, v14, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8f16.nxv8f16(
@@ -217,11 +219,12 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16f16.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwadd_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vv v8, v20, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16f16.nxv16f16(
@@ -319,11 +322,12 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwadd_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vv v8, v11, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.nxv2f64.nxv2f32.nxv2f32(
@@ -370,11 +374,12 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwadd_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vv v8, v14, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.nxv4f32(
@@ -421,11 +426,12 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwadd_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vv v8, v20, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.nxv8f64.nxv8f32.nxv8f32(
@@ -574,11 +580,11 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwadd_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vf v8, v10, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4f16.f16(
@@ -625,11 +631,11 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwadd_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vf v8, v12, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8f16.f16(
@@ -676,11 +682,11 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwadd_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vf v8, v16, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16f16.f16(
@@ -778,11 +784,11 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwadd_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vf v8, v10, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.nxv2f64.nxv2f32.f32(
@@ -829,11 +835,11 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwadd_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwadd.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vf v8, v12, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.f32(
@@ -880,11 +886,11 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwadd_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwadd.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.vf v8, v16, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.nxv8f64.nxv8f32.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll
index 8eb2a2c0391b5..8e97ba581d55c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll
@@ -1289,11 +1289,11 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwadd.w_wv_untie_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwadd.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.wv v8, v10, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.nxv4f16(
@@ -1308,11 +1308,11 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwadd.w_wv_untie_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwadd.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.wv v8, v12, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.nxv8f16(
@@ -1346,11 +1346,11 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwadd.w_wv_untie_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwadd.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.wv v8, v10, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.nxv2f64.nxv2f32(
@@ -1365,11 +1365,11 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwadd.w_wv_untie_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwadd.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.wv v8, v12, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.nxv4f32(
@@ -1384,11 +1384,11 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwadd.w_wv_untie_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwadd.wv v24, v16, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwadd.wv v8, v16, v24
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.w.nxv8f64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll
index 4f03188cf3806..0ac2d472c70fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll
@@ -102,8 +102,8 @@ define <vscale x 4 x float> @intrinsic_vfwcvt_f.f.v_nxv4f32_nxv4f16(<vscale x 4
 ; CHECK-LABEL: intrinsic_vfwcvt_f.f.v_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16(
@@ -145,8 +145,8 @@ define <vscale x 8 x float> @intrinsic_vfwcvt_f.f.v_nxv8f32_nxv8f16(<vscale x 8
 ; CHECK-LABEL: intrinsic_vfwcvt_f.f.v_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8f16(
@@ -188,8 +188,8 @@ define <vscale x 16 x float> @intrinsic_vfwcvt_f.f.v_nxv16f32_nxv16f16(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.f.v_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16f16(
@@ -274,8 +274,8 @@ define <vscale x 2 x double> @intrinsic_vfwcvt_f.f.v_nxv2f64_nxv2f32(<vscale x 2
 ; CHECK-LABEL: intrinsic_vfwcvt_f.f.v_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.f.v.nxv2f64.nxv2f32(
@@ -317,8 +317,8 @@ define <vscale x 4 x double> @intrinsic_vfwcvt_f.f.v_nxv4f64_nxv4f32(<vscale x 4
 ; CHECK-LABEL: intrinsic_vfwcvt_f.f.v_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.f.v.nxv4f64.nxv4f32(
@@ -360,8 +360,8 @@ define <vscale x 8 x double> @intrinsic_vfwcvt_f.f.v_nxv8f64_nxv8f32(<vscale x 8
 ; CHECK-LABEL: intrinsic_vfwcvt_f.f.v_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.f.v.nxv8f64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll
index 3c1e07b4a5ef4..cb6db0de733a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll
@@ -142,8 +142,8 @@ define <vscale x 8 x half> @intrinsic_vfwcvt_f.x.v_nxv8f16_nxv8i8(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv8f16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfwcvt.f.x.v.nxv8f16.nxv8i8(
@@ -185,8 +185,8 @@ define <vscale x 16 x half> @intrinsic_vfwcvt_f.x.v_nxv16f16_nxv16i8(<vscale x 1
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv16f16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfwcvt.f.x.v.nxv16f16.nxv16i8(
@@ -228,8 +228,8 @@ define <vscale x 32 x half> @intrinsic_vfwcvt_f.x.v_nxv32f16_nxv32i8(<vscale x 3
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv32f16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfwcvt.f.x.v.nxv32f16.nxv32i8(
@@ -357,8 +357,8 @@ define <vscale x 4 x float> @intrinsic_vfwcvt_f.x.v_nxv4f32_nxv4i16(<vscale x 4
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv4f32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.x.v.nxv4f32.nxv4i16(
@@ -400,8 +400,8 @@ define <vscale x 8 x float> @intrinsic_vfwcvt_f.x.v_nxv8f32_nxv8i16(<vscale x 8
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv8f32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.x.v.nxv8f32.nxv8i16(
@@ -443,8 +443,8 @@ define <vscale x 16 x float> @intrinsic_vfwcvt_f.x.v_nxv16f32_nxv16i16(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv16f32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.x.v.nxv16f32.nxv16i16(
@@ -529,8 +529,8 @@ define <vscale x 2 x double> @intrinsic_vfwcvt_f.x.v_nxv2f64_nxv2i32(<vscale x 2
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv2f64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.x.v.nxv2f64.nxv2i32(
@@ -572,8 +572,8 @@ define <vscale x 4 x double> @intrinsic_vfwcvt_f.x.v_nxv4f64_nxv4i32(<vscale x 4
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv4f64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.x.v.nxv4f64.nxv4i32(
@@ -615,8 +615,8 @@ define <vscale x 8 x double> @intrinsic_vfwcvt_f.x.v_nxv8f64_nxv8i32(<vscale x 8
 ; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv8f64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.x.v.nxv8f64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll
index 17ea8f50a6943..59dacdaa82ea3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll
@@ -142,8 +142,8 @@ define <vscale x 8 x half> @intrinsic_vfwcvt_f.xu.v_nxv8f16_nxv8i8(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv8f16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfwcvt.f.xu.v.nxv8f16.nxv8i8(
@@ -185,8 +185,8 @@ define <vscale x 16 x half> @intrinsic_vfwcvt_f.xu.v_nxv16f16_nxv16i8(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv16f16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfwcvt.f.xu.v.nxv16f16.nxv16i8(
@@ -228,8 +228,8 @@ define <vscale x 32 x half> @intrinsic_vfwcvt_f.xu.v_nxv32f16_nxv32i8(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv32f16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfwcvt.f.xu.v.nxv32f16.nxv32i8(
@@ -357,8 +357,8 @@ define <vscale x 4 x float> @intrinsic_vfwcvt_f.xu.v_nxv4f32_nxv4i16(<vscale x 4
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv4f32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwcvt.f.xu.v.nxv4f32.nxv4i16(
@@ -400,8 +400,8 @@ define <vscale x 8 x float> @intrinsic_vfwcvt_f.xu.v_nxv8f32_nxv8i16(<vscale x 8
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv8f32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwcvt.f.xu.v.nxv8f32.nxv8i16(
@@ -443,8 +443,8 @@ define <vscale x 16 x float> @intrinsic_vfwcvt_f.xu.v_nxv16f32_nxv16i16(<vscale
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv16f32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.xu.v.nxv16f32.nxv16i16(
@@ -529,8 +529,8 @@ define <vscale x 2 x double> @intrinsic_vfwcvt_f.xu.v_nxv2f64_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv2f64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwcvt.f.xu.v.nxv2f64.nxv2i32(
@@ -572,8 +572,8 @@ define <vscale x 4 x double> @intrinsic_vfwcvt_f.xu.v_nxv4f64_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv4f64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwcvt.f.xu.v.nxv4f64.nxv4i32(
@@ -615,8 +615,8 @@ define <vscale x 8 x double> @intrinsic_vfwcvt_f.xu.v_nxv8f64_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv8f64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwcvt.f.xu.v.nxv8f64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll
index f9f426cd3c9b6..3d1fdda3af3ad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll
@@ -101,8 +101,8 @@ define <vscale x 4 x i32> @intrinsic_vfwcvt_rtz.x.f.v_nxv4i32_nxv4f16(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.x.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfwcvt.rtz.x.f.v.nxv4i32.nxv4f16(
@@ -145,8 +145,8 @@ define <vscale x 8 x i32> @intrinsic_vfwcvt_rtz.x.f.v_nxv8i32_nxv8f16(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.x.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfwcvt.rtz.x.f.v.nxv8i32.nxv8f16(
@@ -189,8 +189,8 @@ define <vscale x 16 x i32> @intrinsic_vfwcvt_rtz.x.f.v_nxv16i32_nxv16f16(<vscale
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.x.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfwcvt.rtz.x.f.v.nxv16i32.nxv16f16(
@@ -277,8 +277,8 @@ define <vscale x 2 x i64> @intrinsic_vfwcvt_rtz.x.f.v_nxv2i64_nxv2f32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.x.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfwcvt.rtz.x.f.v.nxv2i64.nxv2f32(
@@ -321,8 +321,8 @@ define <vscale x 4 x i64> @intrinsic_vfwcvt_rtz.x.f.v_nxv4i64_nxv4f32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.x.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfwcvt.rtz.x.f.v.nxv4i64.nxv4f32(
@@ -365,8 +365,8 @@ define <vscale x 8 x i64> @intrinsic_vfwcvt_rtz.x.f.v_nxv8i64_nxv8f32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.x.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfwcvt.rtz.x.f.v.nxv8i64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll
index 61d2ad5bf892a..8eed12f2d2e72 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll
@@ -101,8 +101,8 @@ define <vscale x 4 x i32> @intrinsic_vfwcvt_rtz.xu.f.v_nxv4i32_nxv4f16(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.xu.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv4i32.nxv4f16(
@@ -145,8 +145,8 @@ define <vscale x 8 x i32> @intrinsic_vfwcvt_rtz.xu.f.v_nxv8i32_nxv8f16(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.xu.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv8i32.nxv8f16(
@@ -189,8 +189,8 @@ define <vscale x 16 x i32> @intrinsic_vfwcvt_rtz.xu.f.v_nxv16i32_nxv16f16(<vscal
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.xu.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv16i32.nxv16f16(
@@ -277,8 +277,8 @@ define <vscale x 2 x i64> @intrinsic_vfwcvt_rtz.xu.f.v_nxv2i64_nxv2f32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.xu.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv2i64.nxv2f32(
@@ -321,8 +321,8 @@ define <vscale x 4 x i64> @intrinsic_vfwcvt_rtz.xu.f.v_nxv4i64_nxv4f32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.xu.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv4i64.nxv4f32(
@@ -365,8 +365,8 @@ define <vscale x 8 x i64> @intrinsic_vfwcvt_rtz.xu.f.v_nxv8i64_nxv8f32(<vscale x
 ; CHECK-LABEL: intrinsic_vfwcvt_rtz.xu.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv8i64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll
index 8b545585c56d0..da52e05a45f11 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll
@@ -106,11 +106,11 @@ declare <vscale x 4 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv4i32.nxv4f16(
 define <vscale x 4 x i32> @intrinsic_vfwcvt_x.f.v_nxv4i32_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv4i32.nxv4f16(
@@ -153,11 +153,11 @@ declare <vscale x 8 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv8i32.nxv8f16(
 define <vscale x 8 x i32> @intrinsic_vfwcvt_x.f.v_nxv8i32_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv8i32.nxv8f16(
@@ -200,11 +200,11 @@ declare <vscale x 16 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv16i32.nxv16f16(
 define <vscale x 16 x i32> @intrinsic_vfwcvt_x.f.v_nxv16i32_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v16, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv16i32.nxv16f16(
@@ -294,11 +294,11 @@ declare <vscale x 2 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv2i64.nxv2f32(
 define <vscale x 2 x i64> @intrinsic_vfwcvt_x.f.v_nxv2i64_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv2i64.nxv2f32(
@@ -341,11 +341,11 @@ declare <vscale x 4 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv4i64.nxv4f32(
 define <vscale x 4 x i64> @intrinsic_vfwcvt_x.f.v_nxv4i64_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv4i64.nxv4f32(
@@ -388,11 +388,11 @@ declare <vscale x 8 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv8i64.nxv8f32(
 define <vscale x 8 x i64> @intrinsic_vfwcvt_x.f.v_nxv8i64_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.x.f.v v16, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.x.f.v v8, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv8i64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll
index 476e2398c479d..01a82998d617c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll
@@ -106,11 +106,11 @@ declare <vscale x 4 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv4i32.nxv4f16(
 define <vscale x 4 x i32> @intrinsic_vfwcvt_xu.f.v_nxv4i32_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv4i32.nxv4f16(
@@ -153,11 +153,11 @@ declare <vscale x 8 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv8i32.nxv8f16(
 define <vscale x 8 x i32> @intrinsic_vfwcvt_xu.f.v_nxv8i32_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv8i32.nxv8f16(
@@ -200,11 +200,11 @@ declare <vscale x 16 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv16i32.nxv16f16(
 define <vscale x 16 x i32> @intrinsic_vfwcvt_xu.f.v_nxv16i32_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v16, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv16i32.nxv16f16(
@@ -294,11 +294,11 @@ declare <vscale x 2 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv2i64.nxv2f32(
 define <vscale x 2 x i64> @intrinsic_vfwcvt_xu.f.v_nxv2i64_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv2i64.nxv2f32(
@@ -341,11 +341,11 @@ declare <vscale x 4 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv4i64.nxv4f32(
 define <vscale x 4 x i64> @intrinsic_vfwcvt_xu.f.v_nxv4i64_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv4i64.nxv4f32(
@@ -388,11 +388,11 @@ declare <vscale x 8 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv8i64.nxv8f32(
 define <vscale x 8 x i64> @intrinsic_vfwcvt_xu.f.v_nxv8i64_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.xu.f.v v16, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwcvt.xu.f.v v8, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv8i64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll
index 1cc2b21f87a4c..d232f83d07c10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll
@@ -101,8 +101,8 @@ define <vscale x 4 x float> @intrinsic_vfwcvtbf16_f.f.v_nxv4f32_nxv4bf16(<vscale
 ; CHECK-LABEL: intrinsic_vfwcvtbf16_f.f.v_nxv4f32_nxv4bf16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwcvtbf16.f.f.v.nxv4f32.nxv4bf16(
@@ -145,8 +145,8 @@ define <vscale x 8 x float> @intrinsic_vfwcvtbf16_f.f.v_nxv8f32_nxv8bf16(<vscale
 ; CHECK-LABEL: intrinsic_vfwcvtbf16_f.f.v_nxv8f32_nxv8bf16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwcvtbf16.f.f.v.nxv8f32.nxv8bf16(
@@ -189,8 +189,8 @@ define <vscale x 16 x float> @intrinsic_vfwcvtbf16_f.f.v_nxv16f32_nxv16bf16(<vsc
 ; CHECK-LABEL: intrinsic_vfwcvtbf16_f.f.v_nxv16f32_nxv16bf16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwcvtbf16.f.f.v.nxv16f32.nxv16bf16(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
index b1e36634e481a..4ef7ea5b52a75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
@@ -414,11 +414,11 @@ define <vscale x 4 x float> @vfmacc_vv_nxv4f32(<vscale x 4 x half> %a, <vscale x
 ; ZVFHMIN-LABEL: vfmacc_vv_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> %m, i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
@@ -437,11 +437,11 @@ define <vscale x 4 x float> @vfmacc_vv_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfmacc_vv_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v10
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
@@ -461,11 +461,11 @@ define <vscale x 4 x float> @vfmacc_vf_nxv4f32(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -487,11 +487,11 @@ define <vscale x 4 x float> @vfmacc_vf_nxv4f32_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -518,11 +518,11 @@ define <vscale x 8 x float> @vfmacc_vv_nxv8f32(<vscale x 8 x half> %a, <vscale x
 ; ZVFHMIN-LABEL: vfmacc_vv_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %m, i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
@@ -541,11 +541,11 @@ define <vscale x 8 x float> @vfmacc_vv_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfmacc_vv_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v20, v12
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
@@ -565,11 +565,11 @@ define <vscale x 8 x float> @vfmacc_vf_nxv8f32(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -591,11 +591,11 @@ define <vscale x 8 x float> @vfmacc_vf_nxv8f32_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v20, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -623,12 +623,10 @@ define <vscale x 16 x float> @vfmacc_vv_nxv16f32(<vscale x 16 x half> %a, <vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vmv4r.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> %m, i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
@@ -647,11 +645,11 @@ define <vscale x 16 x float> @vfmacc_vv_nxv16f32_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfmacc_vv_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv4r.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v16
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v0, v16
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> splat (i1 -1), i32 %evl)
@@ -697,11 +695,11 @@ define <vscale x 16 x float> @vfmacc_vf_nxv16f32_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v0, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
index 94b80075ac14c..36f8e99b27383 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
@@ -338,11 +338,11 @@ define <vscale x 4 x float> @vmfsac_vv_nxv4f32(<vscale x 4 x half> %a, <vscale x
 ; ZVFHMIN-LABEL: vmfsac_vv_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> %m, i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
@@ -362,11 +362,11 @@ define <vscale x 4 x float> @vmfsac_vv_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vv_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v12, v14, v10
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v14, v10
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
@@ -387,11 +387,11 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v8, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -412,14 +412,14 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32_commute(<vscale x 4 x half> %a, h
 ;
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v12, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v14, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -442,11 +442,11 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v8, v14, v10
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -472,11 +472,11 @@ define <vscale x 8 x float> @vmfsac_vv_nxv8f32(<vscale x 8 x half> %a, <vscale x
 ; ZVFHMIN-LABEL: vmfsac_vv_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %m, i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
@@ -496,11 +496,11 @@ define <vscale x 8 x float> @vmfsac_vv_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vv_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v16, v20, v12
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v20, v12
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
@@ -521,11 +521,11 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v8, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -546,14 +546,14 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32_commute(<vscale x 8 x half> %a, h
 ;
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
+; ZVFHMIN-NEXT:    vmv2r.v v16, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v20, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -576,11 +576,11 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmsub.vv v8, v20, v12
+; ZVFHMIN-NEXT:    vfmsub.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll
index 8cc8c5cffca6b..41f2c26c15808 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll
@@ -51,8 +51,9 @@ define <vscale x 2 x double> @vfwmul_vv_nxv2f64(<vscale x 2 x float> %va, <vscal
 ; CHECK-LABEL: vfwmul_vv_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vfwmul.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 2 x float> %va to <vscale x 2 x double>
   %vd = fpext <vscale x 2 x float> %vb to <vscale x 2 x double>
@@ -64,8 +65,8 @@ define <vscale x 2 x double> @vfwmul_vf_nxv2f64(<vscale x 2 x float> %va, float
 ; CHECK-LABEL: vfwmul_vf_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwmul.vf v8, v10, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 2 x float> %head, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
@@ -79,8 +80,8 @@ define <vscale x 2 x double> @vfwmul_vf_nxv2f64_2(<vscale x 2 x float> %va, floa
 ; CHECK-LABEL: vfwmul_vf_nxv2f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwmul.vf v8, v10, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 2 x double> poison, double %fpext, i32 0
@@ -94,8 +95,9 @@ define <vscale x 4 x double> @vfwmul_vv_nxv4f64(<vscale x 4 x float> %va, <vscal
 ; CHECK-LABEL: vfwmul_vv_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vfwmul.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 4 x float> %va to <vscale x 4 x double>
   %vd = fpext <vscale x 4 x float> %vb to <vscale x 4 x double>
@@ -107,8 +109,8 @@ define <vscale x 4 x double> @vfwmul_vf_nxv4f64(<vscale x 4 x float> %va, float
 ; CHECK-LABEL: vfwmul_vf_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vf v12, v8, fa0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwmul.vf v8, v12, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -122,8 +124,8 @@ define <vscale x 4 x double> @vfwmul_vf_nxv4f64_2(<vscale x 4 x float> %va, floa
 ; CHECK-LABEL: vfwmul_vf_nxv4f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vf v12, v8, fa0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwmul.vf v8, v12, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 4 x double> poison, double %fpext, i32 0
@@ -137,8 +139,9 @@ define <vscale x 8 x double> @vfwmul_vv_nxv8f64(<vscale x 8 x float> %va, <vscal
 ; CHECK-LABEL: vfwmul_vv_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vfwmul.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 8 x float> %va to <vscale x 8 x double>
   %vd = fpext <vscale x 8 x float> %vb to <vscale x 8 x double>
@@ -150,8 +153,8 @@ define <vscale x 8 x double> @vfwmul_vf_nxv8f64(<vscale x 8 x float> %va, float
 ; CHECK-LABEL: vfwmul_vf_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vf v16, v8, fa0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwmul.vf v8, v16, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -165,8 +168,8 @@ define <vscale x 8 x double> @vfwmul_vf_nxv8f64_2(<vscale x 8 x float> %va, floa
 ; CHECK-LABEL: vfwmul_vf_nxv8f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vf v16, v8, fa0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwmul.vf v8, v16, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 8 x double> poison, double %fpext, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll
index dae29efc75bf8..d16c418cbf770 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll
@@ -115,11 +115,12 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwmul_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vv v8, v11, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4f16.nxv4f16(
@@ -166,11 +167,12 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwmul_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vv v8, v14, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8f16.nxv8f16(
@@ -217,11 +219,12 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16f16.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwmul_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vv v8, v20, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16f16.nxv16f16(
@@ -319,11 +322,12 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmul.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwmul_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vv v8, v11, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmul.nxv2f64.nxv2f32.nxv2f32(
@@ -370,11 +374,12 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwmul_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vv v8, v14, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.nxv4f32(
@@ -421,11 +426,12 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmul.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwmul_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vv v8, v20, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmul.nxv8f64.nxv8f32.nxv8f32(
@@ -574,11 +580,11 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwmul_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vf v8, v10, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4f16.f16(
@@ -625,11 +631,11 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwmul_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vf v8, v12, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8f16.f16(
@@ -676,11 +682,11 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwmul_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vf v8, v16, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16f16.f16(
@@ -778,11 +784,11 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmul.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwmul_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vf v8, v10, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmul.nxv2f64.nxv2f32.f32(
@@ -829,11 +835,11 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwmul_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwmul.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vf v8, v12, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.f32(
@@ -880,11 +886,11 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmul.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwmul_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwmul.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwmul.vf v8, v16, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmul.nxv8f64.nxv8f32.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
index e447babb03723..fa328356ab585 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
@@ -299,11 +299,11 @@ define <vscale x 4 x float> @vfnmacc_vv_nxv4f32(<vscale x 4 x half> %a, <vscale
 ; ZVFHMIN-LABEL: vfnmacc_vv_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> %m, i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
@@ -324,11 +324,11 @@ define <vscale x 4 x float> @vfnmacc_vv_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vv_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v12, v14, v10
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v14, v10
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
@@ -350,11 +350,11 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v8, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -377,14 +377,14 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32_commute(<vscale x 4 x half> %a,
 ;
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v12, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v14, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -408,11 +408,11 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v8, v14, v10
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -439,11 +439,11 @@ define <vscale x 8 x float> @vfnmacc_vv_nxv8f32(<vscale x 8 x half> %a, <vscale
 ; ZVFHMIN-LABEL: vfnmacc_vv_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %m, i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
@@ -464,11 +464,11 @@ define <vscale x 8 x float> @vfnmacc_vv_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vv_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v16, v20, v12
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v20, v12
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
@@ -490,11 +490,11 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v8, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -517,14 +517,14 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32_commute(<vscale x 8 x half> %a,
 ;
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
+; ZVFHMIN-NEXT:    vmv2r.v v16, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v20, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -548,11 +548,11 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v8, v20, v12
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -580,12 +580,10 @@ define <vscale x 16 x float> @vfnmacc_vv_nxv16f32(<vscale x 16 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vmv4r.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> %m, i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
@@ -606,11 +604,11 @@ define <vscale x 16 x float> @vfnmacc_vv_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-LABEL: vfnmacc_vv_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv4r.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v24, v0, v16
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v0, v16
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> splat (i1 -1), i32 %evl)
@@ -659,14 +657,14 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_commute(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
+; ZVFHMIN-NEXT:    vmv4r.v v24, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v4, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -690,11 +688,11 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v8, v0, v16
+; ZVFHMIN-NEXT:    vfnmadd.vv v8, v24, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
index ea705b2732edf..cc0ae35780a60 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
@@ -287,11 +287,11 @@ define <vscale x 4 x float> @vfnmsac_vv_nxv4f32(<vscale x 4 x half> %a, <vscale
 ; ZVFHMIN-LABEL: vfnmsac_vv_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> %m, i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
@@ -311,11 +311,11 @@ define <vscale x 4 x float> @vfnmsac_vv_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vv_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v12, v14, v10
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v14, v10
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 -1), i32 %evl)
@@ -336,11 +336,11 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v8, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -361,14 +361,14 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32_commute(<vscale x 4 x half> %a,
 ;
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v12, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v14, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v12, v14, v10, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v12
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -391,11 +391,11 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vmv.v.x v14, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v8, v14, v10
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -421,11 +421,11 @@ define <vscale x 8 x float> @vfnmsac_vv_nxv8f32(<vscale x 8 x half> %a, <vscale
 ; ZVFHMIN-LABEL: vfnmsac_vv_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %m, i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
@@ -445,11 +445,11 @@ define <vscale x 8 x float> @vfnmsac_vv_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vv_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv2r.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v16, v20, v12
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v20, v12
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 -1), i32 %evl)
@@ -470,11 +470,11 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v8, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -495,14 +495,14 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32_commute(<vscale x 8 x half> %a,
 ;
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
+; ZVFHMIN-NEXT:    vmv2r.v v16, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v20, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v16, v20, v12, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v16
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -525,11 +525,11 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vmv.v.x v20, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v8, v20, v12
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -556,12 +556,10 @@ define <vscale x 16 x float> @vfnmsac_vv_nxv16f32(<vscale x 16 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vmv4r.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> %m, i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
@@ -581,11 +579,11 @@ define <vscale x 16 x float> @vfnmsac_vv_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-LABEL: vfnmsac_vv_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv4r.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v24, v0, v16
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v0, v16
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> splat (i1 -1), i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> splat (i1 -1), i32 %evl)
@@ -631,14 +629,14 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_commute(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
+; ZVFHMIN-NEXT:    vmv4r.v v24, v8
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vmv.v.x v4, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -661,11 +659,11 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v8, v0, v16
+; ZVFHMIN-NEXT:    vfnmsub.vv v8, v24, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll
index d0cb64d986661..9393cbd4ccafc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll
@@ -88,8 +88,9 @@ define <vscale x 2 x double> @vfwsub_vv_nxv2f64(<vscale x 2 x float> %va, <vscal
 ; CHECK-LABEL: vfwsub_vv_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vfwsub.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 2 x float> %va to <vscale x 2 x double>
   %vd = fpext <vscale x 2 x float> %vb to <vscale x 2 x double>
@@ -101,8 +102,8 @@ define <vscale x 2 x double> @vfwsub_vf_nxv2f64(<vscale x 2 x float> %va, float
 ; CHECK-LABEL: vfwsub_vf_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vf v10, v8, fa0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwsub.vf v8, v10, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 2 x float> %head, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
@@ -116,8 +117,8 @@ define <vscale x 2 x double> @vfwsub_vf_nxv2f64_2(<vscale x 2 x float> %va, floa
 ; CHECK-LABEL: vfwsub_vf_nxv2f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vf v10, v8, fa0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwsub.vf v8, v10, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 2 x double> poison, double %fpext, i32 0
@@ -168,8 +169,9 @@ define <vscale x 4 x double> @vfwsub_vv_nxv4f64(<vscale x 4 x float> %va, <vscal
 ; CHECK-LABEL: vfwsub_vv_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vfwsub.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 4 x float> %va to <vscale x 4 x double>
   %vd = fpext <vscale x 4 x float> %vb to <vscale x 4 x double>
@@ -181,8 +183,8 @@ define <vscale x 4 x double> @vfwsub_vf_nxv4f64(<vscale x 4 x float> %va, float
 ; CHECK-LABEL: vfwsub_vf_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vf v12, v8, fa0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwsub.vf v8, v12, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -196,8 +198,8 @@ define <vscale x 4 x double> @vfwsub_vf_nxv4f64_2(<vscale x 4 x float> %va, floa
 ; CHECK-LABEL: vfwsub_vf_nxv4f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vf v12, v8, fa0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwsub.vf v8, v12, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 4 x double> poison, double %fpext, i32 0
@@ -248,8 +250,9 @@ define <vscale x 8 x double> @vfwsub_vv_nxv8f64(<vscale x 8 x float> %va, <vscal
 ; CHECK-LABEL: vfwsub_vv_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vfwsub.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = fpext <vscale x 8 x float> %va to <vscale x 8 x double>
   %vd = fpext <vscale x 8 x float> %vb to <vscale x 8 x double>
@@ -261,8 +264,8 @@ define <vscale x 8 x double> @vfwsub_vf_nxv8f64(<vscale x 8 x float> %va, float
 ; CHECK-LABEL: vfwsub_vf_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vf v16, v8, fa0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwsub.vf v8, v16, fa0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -276,8 +279,8 @@ define <vscale x 8 x double> @vfwsub_vf_nxv8f64_2(<vscale x 8 x float> %va, floa
 ; CHECK-LABEL: vfwsub_vf_nxv8f64_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vf v16, v8, fa0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwsub.vf v8, v16, fa0
 ; CHECK-NEXT:    ret
   %fpext = fpext float %b to double
   %head = insertelement <vscale x 8 x double> poison, double %fpext, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll
index 4f263c63e545b..703a69d048311 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll
@@ -115,11 +115,12 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwsub_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vv v8, v11, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4f16.nxv4f16(
@@ -166,11 +167,12 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwsub_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vv v8, v14, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8f16.nxv8f16(
@@ -217,11 +219,12 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16f16.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwsub_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vv v8, v20, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16f16.nxv16f16(
@@ -319,11 +322,12 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwsub_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vv v8, v11, v10
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.nxv2f64.nxv2f32.nxv2f32(
@@ -370,11 +374,12 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwsub_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vv v8, v14, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.nxv4f32(
@@ -421,11 +426,12 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwsub_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vv v8, v20, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.nxv8f64.nxv8f32.nxv8f32(
@@ -574,11 +580,11 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwsub_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vf v8, v10, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4f16.f16(
@@ -625,11 +631,11 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwsub_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vf v8, v12, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8f16.f16(
@@ -676,11 +682,11 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwsub_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vf v8, v16, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16f16.f16(
@@ -778,11 +784,11 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwsub_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwsub.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vf v8, v10, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.nxv2f64.nxv2f32.f32(
@@ -829,11 +835,11 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwsub_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwsub.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vf v8, v12, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.f32(
@@ -880,11 +886,11 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwsub_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwsub.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.vf v8, v16, fa0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.nxv8f64.nxv8f32.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll
index fdb48a6f10d3a..ab45345a7a39d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll
@@ -1289,11 +1289,11 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwsub.w_wv_untie_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwsub.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.wv v8, v10, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.nxv4f16(
@@ -1308,11 +1308,11 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwsub.w_wv_untie_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwsub.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.wv v8, v12, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.nxv8f16(
@@ -1346,11 +1346,11 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwsub.w_wv_untie_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwsub.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.wv v8, v10, v12
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.nxv2f32(
@@ -1365,11 +1365,11 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwsub.w_wv_untie_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwsub.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.wv v8, v12, v16
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.nxv4f32(
@@ -1384,11 +1384,11 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwsub.w_wv_untie_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwsub.wv v24, v16, v8
-; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vfwsub.wv v8, v16, v24
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.w.nxv8f64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll
index f25a27ca2b905..599a66d191fd2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll
@@ -641,8 +641,8 @@ define <vscale x 8 x half> @vsitofp_nxv8i8_nxv8f16(<vscale x 8 x i8> %va) strict
 ; CHECK-LABEL: vsitofp_nxv8i8_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x half> @llvm.experimental.constrained.sitofp.nxv8f16.nxv8i8(<vscale x 8 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x half> %evec
@@ -653,8 +653,8 @@ define <vscale x 8 x half> @vuitofp_nxv8i8_nxv8f16(<vscale x 8 x i8> %va) strict
 ; CHECK-LABEL: vuitofp_nxv8i8_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x half> @llvm.experimental.constrained.uitofp.nxv8f16.nxv8i8(<vscale x 8 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x half> %evec
@@ -713,8 +713,8 @@ define <vscale x 16 x half> @vsitofp_nxv16i8_nxv16f16(<vscale x 16 x i8> %va) st
 ; CHECK-LABEL: vsitofp_nxv16i8_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 16 x half> @llvm.experimental.constrained.sitofp.nxv16f16.nxv16i8(<vscale x 16 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 16 x half> %evec
@@ -725,8 +725,8 @@ define <vscale x 16 x half> @vuitofp_nxv16i8_nxv16f16(<vscale x 16 x i8> %va) st
 ; CHECK-LABEL: vuitofp_nxv16i8_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 16 x half> @llvm.experimental.constrained.uitofp.nxv16f16.nxv16i8(<vscale x 16 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 16 x half> %evec
@@ -761,8 +761,8 @@ define <vscale x 32 x half> @vsitofp_nxv32i8_nxv32f16(<vscale x 32 x i8> %va) st
 ; CHECK-LABEL: vsitofp_nxv32i8_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 32 x half> @llvm.experimental.constrained.sitofp.nxv32f16.nxv32i8(<vscale x 32 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 32 x half> %evec
@@ -773,8 +773,8 @@ define <vscale x 32 x half> @vuitofp_nxv32i8_nxv32f16(<vscale x 32 x i8> %va) st
 ; CHECK-LABEL: vuitofp_nxv32i8_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 32 x half> @llvm.experimental.constrained.uitofp.nxv32f16.nxv32i8(<vscale x 32 x i8> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 32 x half> %evec
@@ -947,8 +947,8 @@ define <vscale x 4 x float> @vsitofp_nxv4i16_nxv4f32(<vscale x 4 x i16> %va) str
 ; CHECK-LABEL: vsitofp_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x float> @llvm.experimental.constrained.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x float> %evec
@@ -959,8 +959,8 @@ define <vscale x 4 x float> @vuitofp_nxv4i16_nxv4f32(<vscale x 4 x i16> %va) str
 ; CHECK-LABEL: vuitofp_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x float> @llvm.experimental.constrained.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x float> %evec
@@ -1017,8 +1017,8 @@ define <vscale x 8 x float> @vsitofp_nxv8i16_nxv8f32(<vscale x 8 x i16> %va) str
 ; CHECK-LABEL: vsitofp_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x float> @llvm.experimental.constrained.sitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x float> %evec
@@ -1029,8 +1029,8 @@ define <vscale x 8 x float> @vuitofp_nxv8i16_nxv8f32(<vscale x 8 x i16> %va) str
 ; CHECK-LABEL: vuitofp_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x float> @llvm.experimental.constrained.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x float> %evec
@@ -1087,8 +1087,8 @@ define <vscale x 16 x float> @vsitofp_nxv16i16_nxv16f32(<vscale x 16 x i16> %va)
 ; CHECK-LABEL: vsitofp_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 16 x float> @llvm.experimental.constrained.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 16 x float> %evec
@@ -1099,8 +1099,8 @@ define <vscale x 16 x float> @vuitofp_nxv16i16_nxv16f32(<vscale x 16 x i16> %va)
 ; CHECK-LABEL: vuitofp_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 16 x float> @llvm.experimental.constrained.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 16 x float> %evec
@@ -1249,8 +1249,8 @@ define <vscale x 2 x double> @vsitofp_nxv2i32_nxv2f64(<vscale x 2 x i32> %va) st
 ; CHECK-LABEL: vsitofp_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 2 x double> @llvm.experimental.constrained.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 2 x double> %evec
@@ -1261,8 +1261,8 @@ define <vscale x 2 x double> @vuitofp_nxv2i32_nxv2f64(<vscale x 2 x i32> %va) st
 ; CHECK-LABEL: vuitofp_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 2 x double> @llvm.experimental.constrained.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 2 x double> %evec
@@ -1319,8 +1319,8 @@ define <vscale x 4 x double> @vsitofp_nxv4i32_nxv4f64(<vscale x 4 x i32> %va) st
 ; CHECK-LABEL: vsitofp_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x double> @llvm.experimental.constrained.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x double> %evec
@@ -1331,8 +1331,8 @@ define <vscale x 4 x double> @vuitofp_nxv4i32_nxv4f64(<vscale x 4 x i32> %va) st
 ; CHECK-LABEL: vuitofp_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 4 x double> @llvm.experimental.constrained.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x double> %evec
@@ -1389,8 +1389,8 @@ define <vscale x 8 x double> @vsitofp_nxv8i32_nxv8f64(<vscale x 8 x i32> %va) st
 ; CHECK-LABEL: vsitofp_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x double> @llvm.experimental.constrained.sitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x double> %evec
@@ -1401,8 +1401,8 @@ define <vscale x 8 x double> @vuitofp_nxv8i32_nxv8f64(<vscale x 8 x i32> %va) st
 ; CHECK-LABEL: vuitofp_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = call <vscale x 8 x double> @llvm.experimental.constrained.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x double> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
index 4265663c1feee..95b1c35d48bb3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
@@ -339,11 +339,11 @@ define <vscale x 32 x bfloat> @vsitofp_nxv32i8_nxv32bf16(<vscale x 32 x i8> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vsext.vf2 v16, v10
-; CHECK-NEXT:    vfwcvt.f.x.v v24, v12
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvt.f.x.v v24, v16
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vsext.vf2 v24, v10
+; CHECK-NEXT:    vfwcvt.f.x.v v16, v12
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfwcvt.f.x.v v16, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 32 x i8> %va to <vscale x 32 x bfloat>
   ret <vscale x 32 x bfloat> %evec
@@ -354,11 +354,11 @@ define <vscale x 32 x bfloat> @vuitofp_nxv32i8_nxv32bf16(<vscale x 32 x i8> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vzext.vf2 v16, v10
-; CHECK-NEXT:    vfwcvt.f.xu.v v24, v12
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvt.f.xu.v v24, v16
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vzext.vf2 v24, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 32 x i8> %va to <vscale x 32 x bfloat>
   ret <vscale x 32 x bfloat> %evec
@@ -1496,8 +1496,8 @@ define <vscale x 8 x half> @vsitofp_nxv8i8_nxv8f16(<vscale x 8 x i8> %va) {
 ; ZVFH-LABEL: vsitofp_nxv8i8_nxv8f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.x.v v10, v8
-; ZVFH-NEXT:    vmv2r.v v8, v10
+; ZVFH-NEXT:    vmv1r.v v10, v8
+; ZVFH-NEXT:    vfwcvt.f.x.v v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vsitofp_nxv8i8_nxv8f16:
@@ -1515,8 +1515,8 @@ define <vscale x 8 x half> @vuitofp_nxv8i8_nxv8f16(<vscale x 8 x i8> %va) {
 ; ZVFH-LABEL: vuitofp_nxv8i8_nxv8f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.xu.v v10, v8
-; ZVFH-NEXT:    vmv2r.v v8, v10
+; ZVFH-NEXT:    vmv1r.v v10, v8
+; ZVFH-NEXT:    vfwcvt.f.xu.v v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vuitofp_nxv8i8_nxv8f16:
@@ -1578,8 +1578,8 @@ define <vscale x 16 x half> @vsitofp_nxv16i8_nxv16f16(<vscale x 16 x i8> %va) {
 ; ZVFH-LABEL: vsitofp_nxv16i8_nxv16f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.x.v v12, v8
-; ZVFH-NEXT:    vmv4r.v v8, v12
+; ZVFH-NEXT:    vmv2r.v v12, v8
+; ZVFH-NEXT:    vfwcvt.f.x.v v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vsitofp_nxv16i8_nxv16f16:
@@ -1597,8 +1597,8 @@ define <vscale x 16 x half> @vuitofp_nxv16i8_nxv16f16(<vscale x 16 x i8> %va) {
 ; ZVFH-LABEL: vuitofp_nxv16i8_nxv16f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.xu.v v12, v8
-; ZVFH-NEXT:    vmv4r.v v8, v12
+; ZVFH-NEXT:    vmv2r.v v12, v8
+; ZVFH-NEXT:    vfwcvt.f.xu.v v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vuitofp_nxv16i8_nxv16f16:
@@ -1638,19 +1638,19 @@ define <vscale x 32 x half> @vsitofp_nxv32i8_nxv32f16(<vscale x 32 x i8> %va) {
 ; ZVFH-LABEL: vsitofp_nxv32i8_nxv32f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.x.v v16, v8
-; ZVFH-NEXT:    vmv8r.v v8, v16
+; ZVFH-NEXT:    vmv4r.v v16, v8
+; ZVFH-NEXT:    vfwcvt.f.x.v v8, v16
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vsitofp_nxv32i8_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vsext.vf2 v12, v8
-; ZVFHMIN-NEXT:    vsext.vf2 v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.x.v v24, v12
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.x.v v24, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vsext.vf2 v24, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.x.v v16, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.x.v v16, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    ret
   %evec = sitofp <vscale x 32 x i8> %va to <vscale x 32 x half>
   ret <vscale x 32 x half> %evec
@@ -1660,19 +1660,19 @@ define <vscale x 32 x half> @vuitofp_nxv32i8_nxv32f16(<vscale x 32 x i8> %va) {
 ; ZVFH-LABEL: vuitofp_nxv32i8_nxv32f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.xu.v v16, v8
-; ZVFH-NEXT:    vmv8r.v v8, v16
+; ZVFH-NEXT:    vmv4r.v v16, v8
+; ZVFH-NEXT:    vfwcvt.f.xu.v v8, v16
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vuitofp_nxv32i8_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vzext.vf2 v12, v8
-; ZVFHMIN-NEXT:    vzext.vf2 v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v24, v12
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v24, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vzext.vf2 v24, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v16, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v16, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    ret
   %evec = uitofp <vscale x 32 x i8> %va to <vscale x 32 x half>
   ret <vscale x 32 x half> %evec
@@ -1872,8 +1872,8 @@ define <vscale x 4 x float> @vsitofp_nxv4i16_nxv4f32(<vscale x 4 x i16> %va) {
 ; CHECK-LABEL: vsitofp_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 4 x i16> %va to <vscale x 4 x float>
   ret <vscale x 4 x float> %evec
@@ -1883,8 +1883,8 @@ define <vscale x 4 x float> @vuitofp_nxv4i16_nxv4f32(<vscale x 4 x i16> %va) {
 ; CHECK-LABEL: vuitofp_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 4 x i16> %va to <vscale x 4 x float>
   ret <vscale x 4 x float> %evec
@@ -1950,8 +1950,8 @@ define <vscale x 8 x float> @vsitofp_nxv8i16_nxv8f32(<vscale x 8 x i16> %va) {
 ; CHECK-LABEL: vsitofp_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 8 x i16> %va to <vscale x 8 x float>
   ret <vscale x 8 x float> %evec
@@ -1961,8 +1961,8 @@ define <vscale x 8 x float> @vuitofp_nxv8i16_nxv8f32(<vscale x 8 x i16> %va) {
 ; CHECK-LABEL: vuitofp_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 8 x i16> %va to <vscale x 8 x float>
   ret <vscale x 8 x float> %evec
@@ -2028,8 +2028,8 @@ define <vscale x 16 x float> @vsitofp_nxv16i16_nxv16f32(<vscale x 16 x i16> %va)
 ; CHECK-LABEL: vsitofp_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 16 x i16> %va to <vscale x 16 x float>
   ret <vscale x 16 x float> %evec
@@ -2039,8 +2039,8 @@ define <vscale x 16 x float> @vuitofp_nxv16i16_nxv16f32(<vscale x 16 x i16> %va)
 ; CHECK-LABEL: vuitofp_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 16 x i16> %va to <vscale x 16 x float>
   ret <vscale x 16 x float> %evec
@@ -2226,8 +2226,8 @@ define <vscale x 2 x double> @vsitofp_nxv2i32_nxv2f64(<vscale x 2 x i32> %va) {
 ; CHECK-LABEL: vsitofp_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 2 x i32> %va to <vscale x 2 x double>
   ret <vscale x 2 x double> %evec
@@ -2237,8 +2237,8 @@ define <vscale x 2 x double> @vuitofp_nxv2i32_nxv2f64(<vscale x 2 x i32> %va) {
 ; CHECK-LABEL: vuitofp_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 2 x i32> %va to <vscale x 2 x double>
   ret <vscale x 2 x double> %evec
@@ -2306,8 +2306,8 @@ define <vscale x 4 x double> @vsitofp_nxv4i32_nxv4f64(<vscale x 4 x i32> %va) {
 ; CHECK-LABEL: vsitofp_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 4 x i32> %va to <vscale x 4 x double>
   ret <vscale x 4 x double> %evec
@@ -2317,8 +2317,8 @@ define <vscale x 4 x double> @vuitofp_nxv4i32_nxv4f64(<vscale x 4 x i32> %va) {
 ; CHECK-LABEL: vuitofp_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 4 x i32> %va to <vscale x 4 x double>
   ret <vscale x 4 x double> %evec
@@ -2386,8 +2386,8 @@ define <vscale x 8 x double> @vsitofp_nxv8i32_nxv8f64(<vscale x 8 x i32> %va) {
 ; CHECK-LABEL: vsitofp_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 8 x i32> %va to <vscale x 8 x double>
   ret <vscale x 8 x double> %evec
@@ -2397,8 +2397,8 @@ define <vscale x 8 x double> @vuitofp_nxv8i32_nxv8f64(<vscale x 8 x i32> %va) {
 ; CHECK-LABEL: vuitofp_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 8 x i32> %va to <vscale x 8 x double>
   ret <vscale x 8 x double> %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index 5cd9b77af82cf..10a92f0188a93 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -5073,23 +5073,23 @@ define <vscale x 4 x float> @vfwmaccbf16_vf(<vscale x 4 x float> %a, bfloat %b,
 define <vscale x 4 x double> @vfsqrt(<vscale x 4 x float> %a) {
 ; NOVLOPT-LABEL: vfsqrt:
 ; NOVLOPT:       # %bb.0:
-; NOVLOPT-NEXT:    fsrmi a0, 0
 ; NOVLOPT-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
-; NOVLOPT-NEXT:    vfsqrt.v v10, v8
+; NOVLOPT-NEXT:    vmv2r.v v12, v8
+; NOVLOPT-NEXT:    fsrmi a0, 0
+; NOVLOPT-NEXT:    vfsqrt.v v14, v8
 ; NOVLOPT-NEXT:    fsrm a0
 ; NOVLOPT-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; NOVLOPT-NEXT:    vfwmacc.vv v12, v8, v10
-; NOVLOPT-NEXT:    vmv4r.v v8, v12
+; NOVLOPT-NEXT:    vfwmacc.vv v8, v12, v14
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vfsqrt:
 ; VLOPT:       # %bb.0:
-; VLOPT-NEXT:    fsrmi a0, 0
 ; VLOPT-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; VLOPT-NEXT:    vfsqrt.v v10, v8
+; VLOPT-NEXT:    vmv2r.v v12, v8
+; VLOPT-NEXT:    fsrmi a0, 0
+; VLOPT-NEXT:    vfsqrt.v v14, v8
 ; VLOPT-NEXT:    fsrm a0
-; VLOPT-NEXT:    vfwmacc.vv v12, v8, v10
-; VLOPT-NEXT:    vmv4r.v v8, v12
+; VLOPT-NEXT:    vfwmacc.vv v8, v12, v14
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 4 x float> @llvm.riscv.vfsqrt.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 0, iXLen 7)
   %2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0)
@@ -5100,18 +5100,18 @@ define <vscale x 4 x double> @vfrsqrt7(<vscale x 4 x float> %a) {
 ; NOVLOPT-LABEL: vfrsqrt7:
 ; NOVLOPT:       # %bb.0:
 ; NOVLOPT-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
-; NOVLOPT-NEXT:    vfrsqrt7.v v10, v8
+; NOVLOPT-NEXT:    vmv2r.v v12, v8
+; NOVLOPT-NEXT:    vfrsqrt7.v v14, v8
 ; NOVLOPT-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; NOVLOPT-NEXT:    vfwmacc.vv v12, v8, v10
-; NOVLOPT-NEXT:    vmv4r.v v8, v12
+; NOVLOPT-NEXT:    vfwmacc.vv v8, v12, v14
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vfrsqrt7:
 ; VLOPT:       # %bb.0:
 ; VLOPT-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; VLOPT-NEXT:    vfrsqrt7.v v10, v8
-; VLOPT-NEXT:    vfwmacc.vv v12, v8, v10
-; VLOPT-NEXT:    vmv4r.v v8, v12
+; VLOPT-NEXT:    vmv2r.v v12, v8
+; VLOPT-NEXT:    vfrsqrt7.v v14, v8
+; VLOPT-NEXT:    vfwmacc.vv v8, v12, v14
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 7)
   %2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/rvv/vloxei.ll
index 65eedbb0cc898..03b4268975d4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vloxei.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vloxei.ll
@@ -777,8 +777,8 @@ define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
@@ -825,8 +825,8 @@ define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
@@ -873,8 +873,8 @@ define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vloxei32.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
@@ -1444,8 +1444,8 @@ define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
@@ -1492,8 +1492,8 @@ define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
@@ -1540,8 +1540,8 @@ define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vloxei32.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
@@ -2254,8 +2254,8 @@ define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16(
@@ -2302,8 +2302,8 @@ define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16(
@@ -2350,8 +2350,8 @@ define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vloxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16(
@@ -2446,8 +2446,8 @@ define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16(
@@ -2494,8 +2494,8 @@ define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16(
@@ -2542,8 +2542,8 @@ define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vloxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16(
@@ -2968,8 +2968,8 @@ define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16(
@@ -3016,8 +3016,8 @@ define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16(
@@ -3064,8 +3064,8 @@ define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vloxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16(
@@ -3160,8 +3160,8 @@ define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16(
@@ -3208,8 +3208,8 @@ define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16(
@@ -3256,8 +3256,8 @@ define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vloxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16(
@@ -3777,8 +3777,8 @@ define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8(
@@ -3825,8 +3825,8 @@ define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8(
@@ -3873,8 +3873,8 @@ define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vloxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8(
@@ -4017,8 +4017,8 @@ define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8(
@@ -4065,8 +4065,8 @@ define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8(
@@ -4113,8 +4113,8 @@ define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vloxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8(
@@ -4209,8 +4209,8 @@ define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8(
@@ -4257,8 +4257,8 @@ define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8(
@@ -4305,8 +4305,8 @@ define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vloxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8(
@@ -4497,8 +4497,8 @@ define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8(
@@ -4545,8 +4545,8 @@ define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8(
@@ -4593,8 +4593,8 @@ define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vloxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8(
@@ -4977,8 +4977,8 @@ define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8(
@@ -5025,8 +5025,8 @@ define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8(
@@ -5073,8 +5073,8 @@ define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vloxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8(
@@ -5169,8 +5169,8 @@ define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8(
@@ -5217,8 +5217,8 @@ define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8(
@@ -5265,8 +5265,8 @@ define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vloxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/rvv/vluxei.ll
index 240f377be1ce3..13c9f3304183e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vluxei.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vluxei.ll
@@ -777,8 +777,8 @@ define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei32.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32(
@@ -825,8 +825,8 @@ define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxei32.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32(
@@ -873,8 +873,8 @@ define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vluxei32.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vluxei32.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32(
@@ -1684,8 +1684,8 @@ define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei32.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32(
@@ -1732,8 +1732,8 @@ define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxei32.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32(
@@ -1780,8 +1780,8 @@ define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vluxei32.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vluxei32.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32(
@@ -2494,8 +2494,8 @@ define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16(
@@ -2542,8 +2542,8 @@ define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16(
@@ -2590,8 +2590,8 @@ define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vluxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16(
@@ -2686,8 +2686,8 @@ define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16(
@@ -2734,8 +2734,8 @@ define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16(
@@ -2782,8 +2782,8 @@ define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vluxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16(
@@ -3208,8 +3208,8 @@ define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16(
@@ -3256,8 +3256,8 @@ define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16(
@@ -3304,8 +3304,8 @@ define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vluxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16(
@@ -3400,8 +3400,8 @@ define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16(
@@ -3448,8 +3448,8 @@ define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16(
@@ -3496,8 +3496,8 @@ define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vluxei16.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vluxei16.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16(
@@ -4017,8 +4017,8 @@ define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8(
@@ -4065,8 +4065,8 @@ define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8(
@@ -4113,8 +4113,8 @@ define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vluxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8(
@@ -4257,8 +4257,8 @@ define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8(
@@ -4305,8 +4305,8 @@ define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8(
@@ -4353,8 +4353,8 @@ define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vluxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8(
@@ -4449,8 +4449,8 @@ define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8(
@@ -4497,8 +4497,8 @@ define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8(
@@ -4545,8 +4545,8 @@ define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vs
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vluxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8(
@@ -4737,8 +4737,8 @@ define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <v
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8(
@@ -4785,8 +4785,8 @@ define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8(
@@ -4833,8 +4833,8 @@ define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vluxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8(
@@ -4977,8 +4977,8 @@ define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8(
@@ -5025,8 +5025,8 @@ define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8(
@@ -5073,8 +5073,8 @@ define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vluxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8(
@@ -5169,8 +5169,8 @@ define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8(
@@ -5217,8 +5217,8 @@ define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8(
@@ -5265,8 +5265,8 @@ define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0,
 ; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vluxei8.v v16, (a0), v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-inttoptr-ptrtoint.ll b/llvm/test/CodeGen/RISCV/rvv/vp-inttoptr-ptrtoint.ll
index e206444e4ef22..df003907dc360 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-inttoptr-ptrtoint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-inttoptr-ptrtoint.ll
@@ -7,8 +7,8 @@ define <vscale x 4 x ptr> @inttoptr_nxv4p0_nxv4i8(<vscale x 4 x i8> %va, <vscale
 ; CHECK-LABEL: inttoptr_nxv4p0_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf8 v12, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf8 v8, v12, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x ptr> @llvm.vp.inttoptr.nxv4p0.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x ptr> %v
@@ -20,8 +20,8 @@ define <vscale x 4 x ptr> @inttoptr_nxv4p0_nxv4i16(<vscale x 4 x i16> %va, <vsca
 ; CHECK-LABEL: inttoptr_nxv4p0_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf4 v8, v12, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x ptr> @llvm.vp.inttoptr.nxv4p0.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x ptr> %v
@@ -33,8 +33,8 @@ define <vscale x 4 x ptr> @inttoptr_nxv4p0_nxv4i32(<vscale x 4 x i32> %va, <vsca
 ; CHECK-LABEL: inttoptr_nxv4p0_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x ptr> @llvm.vp.inttoptr.nxv4p0.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x ptr> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 00d27dd583c8a..d6e1af59e6341 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -575,13 +575,13 @@ define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32>
 ; RV32-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
 ; RV32-NEXT:    vslideup.vx v12, v8, a3
 ; RV32-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
-; RV32-NEXT:    vwaddu.vv v14, v12, v9
-; RV32-NEXT:    vwmaccu.vx v14, a2, v9
+; RV32-NEXT:    vwaddu.vv v16, v12, v9
+; RV32-NEXT:    vwmaccu.vx v16, a2, v9
 ; RV32-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
-; RV32-NEXT:    vwaddu.vv v16, v14, v10
-; RV32-NEXT:    vwmaccu.vx v16, a2, v10
+; RV32-NEXT:    vwaddu.vv v12, v16, v10
+; RV32-NEXT:    vwmaccu.vx v12, a2, v10
 ; RV32-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v16, (a0)
+; RV32-NEXT:    vse32.v v12, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: not_balanced_store_tree:
@@ -599,14 +599,14 @@ define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32>
 ; RV64-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
 ; RV64-NEXT:    vslideup.vx v12, v8, a3
 ; RV64-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT:    vwaddu.vv v14, v12, v9
-; RV64-NEXT:    vwmaccu.vx v14, a2, v9
+; RV64-NEXT:    vwaddu.vv v16, v12, v9
+; RV64-NEXT:    vwmaccu.vx v16, a2, v9
 ; RV64-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
-; RV64-NEXT:    vwaddu.vv v16, v14, v10
-; RV64-NEXT:    vwmaccu.vx v16, a2, v10
+; RV64-NEXT:    vwaddu.vv v12, v16, v10
+; RV64-NEXT:    vwmaccu.vx v12, a2, v10
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v16, (a0)
+; RV64-NEXT:    vse32.v v12, (a0)
 ; RV64-NEXT:    ret
   %rvl = mul i32 %evl, 4
   %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index 68e7297605be2..b6ec7906885ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -936,8 +936,8 @@ define <vscale x 2 x i64> @vpgather_nxv2i64(<vscale x 2 x ptr> %ptrs, <vscale x
 ; RV32-LABEL: vpgather_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv2i64:
@@ -955,8 +955,8 @@ define <vscale x 4 x i64> @vpgather_nxv4i64(<vscale x 4 x ptr> %ptrs, <vscale x
 ; RV32-LABEL: vpgather_nxv4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv4i64:
@@ -972,8 +972,8 @@ define <vscale x 4 x i64> @vpgather_truemask_nxv4i64(<vscale x 4 x ptr> %ptrs, i
 ; RV32-LABEL: vpgather_truemask_nxv4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_truemask_nxv4i64:
@@ -991,8 +991,8 @@ define <vscale x 8 x i64> @vpgather_nxv8i64(<vscale x 8 x ptr> %ptrs, <vscale x
 ; RV32-LABEL: vpgather_nxv8i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vluxei32.v v16, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v16
+; RV32-NEXT:    vmv4r.v v16, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v16, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv8i64:
@@ -1885,8 +1885,8 @@ define <vscale x 2 x double> @vpgather_nxv2f64(<vscale x 2 x ptr> %ptrs, <vscale
 ; RV32-LABEL: vpgather_nxv2f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vluxei32.v v10, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    vmv1r.v v10, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv2f64:
@@ -1904,8 +1904,8 @@ define <vscale x 4 x double> @vpgather_nxv4f64(<vscale x 4 x ptr> %ptrs, <vscale
 ; RV32-LABEL: vpgather_nxv4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv4f64:
@@ -1921,8 +1921,8 @@ define <vscale x 4 x double> @vpgather_truemask_nxv4f64(<vscale x 4 x ptr> %ptrs
 ; RV32-LABEL: vpgather_truemask_nxv4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vluxei32.v v12, (zero), v8
-; RV32-NEXT:    vmv.v.v v8, v12
+; RV32-NEXT:    vmv2r.v v12, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_truemask_nxv4f64:
@@ -1940,8 +1940,8 @@ define <vscale x 6 x double> @vpgather_nxv6f64(<vscale x 6 x ptr> %ptrs, <vscale
 ; RV32-LABEL: vpgather_nxv6f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vluxei32.v v16, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v16
+; RV32-NEXT:    vmv4r.v v16, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v16, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv6f64:
@@ -2186,8 +2186,8 @@ define <vscale x 8 x double> @vpgather_nxv8f64(<vscale x 8 x ptr> %ptrs, <vscale
 ; RV32-LABEL: vpgather_nxv8f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vluxei32.v v16, (zero), v8, v0.t
-; RV32-NEXT:    vmv.v.v v8, v16
+; RV32-NEXT:    vmv4r.v v16, v8
+; RV32-NEXT:    vluxei32.v v8, (zero), v16, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_nxv8f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index 414c059345498..b9767638cdf82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -560,7 +560,6 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -568,22 +567,23 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
-; CHECK-NEXT:    vl8r.v v16, (a2)
+; CHECK-NEXT:    vl8r.v v24, (a2)
 ; CHECK-NEXT:    sltu a2, a3, a4
 ; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; CHECK-NEXT:    bltu a3, a1, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
index cb925bfb0f237..25a226e60e715 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
@@ -1008,12 +1008,12 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
+; CHECK-RV32-NEXT:    vmv.v.i v14, 0
+; CHECK-RV32-NEXT:    vwsub.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
@@ -1071,12 +1071,12 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
+; CHECK-RV32-NEXT:    vmv.v.i v20, 0
+; CHECK-RV32-NEXT:    vwsub.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
@@ -1134,12 +1134,12 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v16, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v24, 0
-; CHECK-RV32-NEXT:    vwsub.vx v0, v24, a0
+; CHECK-RV32-NEXT:    vmv.v.i v4, 0
+; CHECK-RV32-NEXT:    vwsub.vx v24, v4, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vand.vx v24, v0, a0
+; CHECK-RV32-NEXT:    vand.vx v24, v24, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
index 74743f9068a64..9e63b613ab70b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
@@ -1714,12 +1714,12 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
+; CHECK-RV32-NEXT:    vmv.v.i v14, 0
+; CHECK-RV32-NEXT:    vwsub.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
@@ -1816,12 +1816,12 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
+; CHECK-RV32-NEXT:    vmv.v.i v20, 0
+; CHECK-RV32-NEXT:    vwsub.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
@@ -1918,12 +1918,12 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.x v16, a0
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v24, 0
-; CHECK-RV32-NEXT:    vwsub.vx v0, v24, a0
+; CHECK-RV32-NEXT:    vmv.v.i v4, 0
+; CHECK-RV32-NEXT:    vwsub.vx v24, v4, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vand.vx v24, v0, a0
+; CHECK-RV32-NEXT:    vand.vx v24, v24, a0
 ; CHECK-RV32-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index d2ef711fc0d74..206838917d004 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -107,32 +107,32 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i16i32_multiple_users(ptr %x, ptr %y
 define <vscale x 2 x i64> @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y, ptr %z) {
 ; NO_FOLDING-LABEL: vwop_vscale_sext_i32i64_multiple_users:
 ; NO_FOLDING:       # %bb.0:
-; NO_FOLDING-NEXT:    vl1re32.v v8, (a0)
-; NO_FOLDING-NEXT:    vl1re32.v v9, (a1)
-; NO_FOLDING-NEXT:    vl1re32.v v10, (a2)
+; NO_FOLDING-NEXT:    vl1re32.v v10, (a0)
+; NO_FOLDING-NEXT:    vl1re32.v v12, (a1)
+; NO_FOLDING-NEXT:    vl1re32.v v14, (a2)
 ; NO_FOLDING-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; NO_FOLDING-NEXT:    vsext.vf2 v12, v8
-; NO_FOLDING-NEXT:    vsext.vf2 v14, v9
 ; NO_FOLDING-NEXT:    vsext.vf2 v8, v10
-; NO_FOLDING-NEXT:    vmul.vv v10, v12, v14
-; NO_FOLDING-NEXT:    vadd.vv v14, v12, v8
-; NO_FOLDING-NEXT:    vsub.vv v8, v12, v8
+; NO_FOLDING-NEXT:    vsext.vf2 v10, v12
+; NO_FOLDING-NEXT:    vsext.vf2 v12, v14
+; NO_FOLDING-NEXT:    vmul.vv v10, v8, v10
+; NO_FOLDING-NEXT:    vadd.vv v14, v8, v12
+; NO_FOLDING-NEXT:    vsub.vv v8, v8, v12
 ; NO_FOLDING-NEXT:    vor.vv v10, v10, v14
 ; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
 ; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwop_vscale_sext_i32i64_multiple_users:
 ; FOLDING:       # %bb.0:
-; FOLDING-NEXT:    vl1re32.v v8, (a0)
-; FOLDING-NEXT:    vl1re32.v v9, (a1)
-; FOLDING-NEXT:    vl1re32.v v10, (a2)
+; FOLDING-NEXT:    vl1re32.v v14, (a0)
+; FOLDING-NEXT:    vl1re32.v v10, (a1)
+; FOLDING-NEXT:    vl1re32.v v15, (a2)
 ; FOLDING-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; FOLDING-NEXT:    vwmul.vv v12, v8, v9
-; FOLDING-NEXT:    vwadd.vv v14, v8, v10
-; FOLDING-NEXT:    vwsub.vv v16, v8, v10
+; FOLDING-NEXT:    vwmul.vv v8, v14, v10
+; FOLDING-NEXT:    vwadd.vv v10, v14, v15
+; FOLDING-NEXT:    vwsub.vv v12, v14, v15
 ; FOLDING-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; FOLDING-NEXT:    vor.vv v8, v12, v14
-; FOLDING-NEXT:    vor.vv v8, v8, v16
+; FOLDING-NEXT:    vor.vv v8, v8, v10
+; FOLDING-NEXT:    vor.vv v8, v8, v12
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i32>, ptr %x
   %b = load <vscale x 2 x i32>, ptr %y
@@ -398,32 +398,32 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i16i32_multiple_users(ptr %x, ptr %y
 define <vscale x 2 x i64> @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y, ptr %z) {
 ; NO_FOLDING-LABEL: vwop_vscale_zext_i32i64_multiple_users:
 ; NO_FOLDING:       # %bb.0:
-; NO_FOLDING-NEXT:    vl1re32.v v8, (a0)
-; NO_FOLDING-NEXT:    vl1re32.v v9, (a1)
-; NO_FOLDING-NEXT:    vl1re32.v v10, (a2)
+; NO_FOLDING-NEXT:    vl1re32.v v10, (a0)
+; NO_FOLDING-NEXT:    vl1re32.v v12, (a1)
+; NO_FOLDING-NEXT:    vl1re32.v v14, (a2)
 ; NO_FOLDING-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; NO_FOLDING-NEXT:    vzext.vf2 v12, v8
-; NO_FOLDING-NEXT:    vzext.vf2 v14, v9
 ; NO_FOLDING-NEXT:    vzext.vf2 v8, v10
-; NO_FOLDING-NEXT:    vmul.vv v10, v12, v14
-; NO_FOLDING-NEXT:    vadd.vv v14, v12, v8
-; NO_FOLDING-NEXT:    vsub.vv v8, v12, v8
+; NO_FOLDING-NEXT:    vzext.vf2 v10, v12
+; NO_FOLDING-NEXT:    vzext.vf2 v12, v14
+; NO_FOLDING-NEXT:    vmul.vv v10, v8, v10
+; NO_FOLDING-NEXT:    vadd.vv v14, v8, v12
+; NO_FOLDING-NEXT:    vsub.vv v8, v8, v12
 ; NO_FOLDING-NEXT:    vor.vv v10, v10, v14
 ; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
 ; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwop_vscale_zext_i32i64_multiple_users:
 ; FOLDING:       # %bb.0:
-; FOLDING-NEXT:    vl1re32.v v8, (a0)
-; FOLDING-NEXT:    vl1re32.v v9, (a1)
-; FOLDING-NEXT:    vl1re32.v v10, (a2)
+; FOLDING-NEXT:    vl1re32.v v14, (a0)
+; FOLDING-NEXT:    vl1re32.v v10, (a1)
+; FOLDING-NEXT:    vl1re32.v v15, (a2)
 ; FOLDING-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; FOLDING-NEXT:    vwmulu.vv v12, v8, v9
-; FOLDING-NEXT:    vwaddu.vv v14, v8, v10
-; FOLDING-NEXT:    vwsubu.vv v16, v8, v10
+; FOLDING-NEXT:    vwmulu.vv v8, v14, v10
+; FOLDING-NEXT:    vwaddu.vv v10, v14, v15
+; FOLDING-NEXT:    vwsubu.vv v12, v14, v15
 ; FOLDING-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; FOLDING-NEXT:    vor.vv v8, v12, v14
-; FOLDING-NEXT:    vor.vv v8, v8, v16
+; FOLDING-NEXT:    vor.vv v8, v8, v10
+; FOLDING-NEXT:    vor.vv v8, v8, v12
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i32>, ptr %x
   %b = load <vscale x 2 x i32>, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 3c91131fe4d12..eec2a5f3efcfb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -56,8 +56,8 @@ define <vscale x 2 x i64> @vsext_nxv2i8_nxv2i64(<vscale x 2 x i8> %a, <vscale x
 ; CHECK-LABEL: vsext_nxv2i8_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf8 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf8 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -67,8 +67,8 @@ define <vscale x 2 x i64> @vsext_nxv2i8_nxv2i64_unmasked(<vscale x 2 x i8> %a, i
 ; CHECK-LABEL: vsext_nxv2i8_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf8 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -104,8 +104,8 @@ define <vscale x 2 x i64> @vsext_nxv2i16_nxv2i64(<vscale x 2 x i16> %a, <vscale
 ; CHECK-LABEL: vsext_nxv2i16_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -115,8 +115,8 @@ define <vscale x 2 x i64> @vsext_nxv2i16_nxv2i64_unmasked(<vscale x 2 x i16> %a,
 ; CHECK-LABEL: vsext_nxv2i16_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -128,8 +128,8 @@ define <vscale x 2 x i64> @vsext_nxv2i32_nxv2i64(<vscale x 2 x i32> %a, <vscale
 ; CHECK-LABEL: vsext_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -139,8 +139,8 @@ define <vscale x 2 x i64> @vsext_nxv2i32_nxv2i64_unmasked(<vscale x 2 x i32> %a,
 ; CHECK-LABEL: vsext_nxv2i32_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext.ll b/llvm/test/CodeGen/RISCV/rvv/vsext.ll
index ba6783e8a6d39..bb83dbc98401a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext.ll
@@ -57,8 +57,8 @@ define <vscale x 2 x i64> @intrinsic_vsext_vf8_nxv2i64(<vscale x 2 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vsext_vf8_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf8 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsext.nxv2i64.nxv2i8(
@@ -101,8 +101,8 @@ define <vscale x 4 x i64> @intrinsic_vsext_vf8_nxv4i64(<vscale x 4 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vsext_vf8_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsext.vf8 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsext.vf8 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsext.nxv4i64.nxv4i8(
@@ -145,8 +145,8 @@ define <vscale x 8 x i64> @intrinsic_vsext_vf8_nxv8i64(<vscale x 8 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vsext_vf8_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vsext.vf8 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsext.nxv8i64.nxv8i8(
@@ -233,8 +233,8 @@ define <vscale x 2 x i64> @intrinsic_vsext_vf4_nxv2i64(<vscale x 2 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf4_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsext.nxv2i64.nxv2i16(
@@ -277,8 +277,8 @@ define <vscale x 4 x i64> @intrinsic_vsext_vf4_nxv4i64(<vscale x 4 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf4_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsext.vf4 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsext.nxv4i64.nxv4i16(
@@ -321,8 +321,8 @@ define <vscale x 8 x i64> @intrinsic_vsext_vf4_nxv8i64(<vscale x 8 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf4_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vsext.vf4 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsext.nxv8i64.nxv8i16(
@@ -453,8 +453,8 @@ define <vscale x 4 x i32> @intrinsic_vsext_vf4_nxv4i32(<vscale x 4 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vsext_vf4_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf4 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vsext.nxv4i32.nxv4i8(
@@ -497,8 +497,8 @@ define <vscale x 8 x i32> @intrinsic_vsext_vf4_nxv8i32(<vscale x 8 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vsext_vf4_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsext.vf4 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vsext.nxv8i32.nxv8i8(
@@ -541,8 +541,8 @@ define <vscale x 16 x i32> @intrinsic_vsext_vf4_nxv16i32(<vscale x 16 x i8> %0,
 ; CHECK-LABEL: intrinsic_vsext_vf4_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vsext.vf4 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vsext.nxv16i32.nxv16i8(
@@ -629,8 +629,8 @@ define <vscale x 2 x i64> @intrinsic_vsext_vf2_nxv2i64(<vscale x 2 x i32> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vsext.nxv2i64.nxv2i32(
@@ -673,8 +673,8 @@ define <vscale x 4 x i64> @intrinsic_vsext_vf2_nxv4i64(<vscale x 4 x i32> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsext.nxv4i64.nxv4i32(
@@ -717,8 +717,8 @@ define <vscale x 8 x i64> @intrinsic_vsext_vf2_nxv8i64(<vscale x 8 x i32> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsext.nxv8i64.nxv8i32(
@@ -849,8 +849,8 @@ define <vscale x 4 x i32> @intrinsic_vsext_vf2_nxv4i32(<vscale x 4 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vsext.nxv4i32.nxv4i16(
@@ -893,8 +893,8 @@ define <vscale x 8 x i32> @intrinsic_vsext_vf2_nxv8i32(<vscale x 8 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vsext.nxv8i32.nxv8i16(
@@ -937,8 +937,8 @@ define <vscale x 16 x i32> @intrinsic_vsext_vf2_nxv16i32(<vscale x 16 x i16> %0,
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vsext.nxv16i32.nxv16i16(
@@ -1113,8 +1113,8 @@ define <vscale x 8 x i16> @intrinsic_vsext_vf2_nxv8i16(<vscale x 8 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsext.vf2 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vsext.nxv8i16.nxv8i8(
@@ -1157,8 +1157,8 @@ define <vscale x 16 x i16> @intrinsic_vsext_vf2_nxv16i16(<vscale x 16 x i8> %0,
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vsext.nxv16i16.nxv16i8(
@@ -1201,8 +1201,8 @@ define <vscale x 32 x i16> @intrinsic_vsext_vf2_nxv32i16(<vscale x 32 x i8> %0,
 ; CHECK-LABEL: intrinsic_vsext_vf2_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vsext.nxv32i16.nxv32i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 045003600e64a..f69ae3d560ef7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -462,8 +462,8 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK-LABEL: vsitofp_nxv2f64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -473,8 +473,8 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i32_unmasked(<vscale x 2 x i32
 ; CHECK-LABEL: vsitofp_nxv2f64_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.x.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.x.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x double> %v
@@ -508,8 +508,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-LABEL: vsitofp_nxv32f16_nxv32i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    vmv1r.v v24, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    srli a2, a1, 2
 ; ZVFH-NEXT:    slli a1, a1, 1
@@ -519,15 +518,15 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.x.w v20, v24, v0.t
+; ZVFH-NEXT:    vfncvt.f.x.w v28, v16, v0.t
 ; ZVFH-NEXT:    bltu a0, a1, .LBB34_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    mv a0, a1
 ; ZVFH-NEXT:  .LBB34_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmv1r.v v0, v24
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.x.w v16, v8, v0.t
-; ZVFH-NEXT:    vmv8r.v v8, v16
+; ZVFH-NEXT:    vfncvt.f.x.w v24, v8, v0.t
+; ZVFH-NEXT:    vmv8r.v v8, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vsitofp_nxv32f16_nxv32i32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index 3bb56379e4613..6f9aea2d00981 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -298,7 +298,7 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vl8re64.v v8, (a0)
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    srli a5, a1, 3
 ; CHECK-NEXT:    slli a4, a1, 3
 ; CHECK-NEXT:    slli a3, a1, 1
@@ -312,20 +312,20 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a4, a7, a4
 ; CHECK-NEXT:    srli a7, a1, 2
-; CHECK-NEXT:    vl8re64.v v24, (a6)
-; CHECK-NEXT:    vslidedown.vx v16, v0, a7
+; CHECK-NEXT:    vl8re64.v v8, (a6)
+; CHECK-NEXT:    vslidedown.vx v24, v0, a7
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a5
-; CHECK-NEXT:    vslidedown.vx v0, v16, a5
+; CHECK-NEXT:    vslidedown.vx v0, v24, a5
 ; CHECK-NEXT:    bltu a0, a1, .LBB17_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB17_2:
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v20, v24, 0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vnsrl.wi v28, v8, 0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB17_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a3
@@ -336,9 +336,9 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v24, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v12, v16, 0, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB17_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
@@ -348,9 +348,10 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index c3120668389cd..e1edaaadadf1d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -454,8 +454,8 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK-LABEL: vuitofp_nxv2f64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -465,8 +465,8 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i32_unmasked(<vscale x 2 x i32
 ; CHECK-LABEL: vuitofp_nxv2f64_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x double> %v
@@ -500,8 +500,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-LABEL: vuitofp_nxv32f16_nxv32i32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    vmv1r.v v24, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    srli a2, a1, 2
 ; ZVFH-NEXT:    slli a1, a1, 1
@@ -511,15 +510,15 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.xu.w v20, v24, v0.t
+; ZVFH-NEXT:    vfncvt.f.xu.w v28, v16, v0.t
 ; ZVFH-NEXT:    bltu a0, a1, .LBB34_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    mv a0, a1
 ; ZVFH-NEXT:  .LBB34_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmv1r.v v0, v24
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.xu.w v16, v8, v0.t
-; ZVFH-NEXT:    vmv8r.v v8, v16
+; ZVFH-NEXT:    vfncvt.f.xu.w v24, v8, v0.t
+; ZVFH-NEXT:    vmv8r.v v8, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vuitofp_nxv32f16_nxv32i32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
index 02af09f028fc1..747fb5a3e7ff8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
@@ -5,12 +5,13 @@
 define <vscale x 8 x i64> @vwadd_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vwadd_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v24, v8
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
-; CHECK-NEXT:    vwadd.wv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
+; CHECK-NEXT:    vwadd.wv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
@@ -22,12 +23,13 @@ define <vscale x 8 x i64> @vwadd_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 define <vscale x 8 x i64> @vwaddu_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vwaddu_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v24, v8
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
-; CHECK-NEXT:    vwaddu.wv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
+; CHECK-NEXT:    vwaddu.wv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
@@ -39,13 +41,13 @@ define <vscale x 8 x i64> @vwaddu_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 define <vscale x 8 x i64> @vwaddu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
 ; CHECK-LABEL: vwaddu_vv_mask_v8i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v12
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmerge.vvm v20, v12, v8, v0
+; CHECK-NEXT:    vwaddu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
@@ -58,12 +60,13 @@ define <vscale x 8 x i64> @vwaddu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 define <vscale x 8 x i64> @vwadd_wv_mask_v8i32_commutative(<vscale x 8 x i32> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vwadd_wv_mask_v8i32_commutative:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v24, v8
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
-; CHECK-NEXT:    vwadd.wv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
+; CHECK-NEXT:    vwadd.wv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
index ddc27f7562cdb..3f5d42f89337b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
@@ -110,8 +110,9 @@ define <vscale x 2 x i64> @vwadd_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <vsc
 ; CHECK-LABEL: vwadd_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwadd.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwadd.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = sext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -123,8 +124,9 @@ define <vscale x 2 x i64> @vwaddu_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <vs
 ; CHECK-LABEL: vwaddu_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -136,8 +138,8 @@ define <vscale x 2 x i64> @vwadd_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i32
 ; CHECK-LABEL: vwadd_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwadd.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwadd.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -151,8 +153,8 @@ define <vscale x 2 x i64> @vwaddu_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i32
 ; CHECK-LABEL: vwaddu_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwaddu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -214,8 +216,9 @@ define <vscale x 4 x i64> @vwadd_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <vsc
 ; CHECK-LABEL: vwadd_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwadd.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwadd.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = sext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -227,8 +230,9 @@ define <vscale x 4 x i64> @vwaddu_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <vs
 ; CHECK-LABEL: vwaddu_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -240,8 +244,8 @@ define <vscale x 4 x i64> @vwadd_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i32
 ; CHECK-LABEL: vwadd_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwadd.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwadd.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -255,8 +259,8 @@ define <vscale x 4 x i64> @vwaddu_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i32
 ; CHECK-LABEL: vwaddu_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwaddu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -318,8 +322,9 @@ define <vscale x 8 x i64> @vwadd_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <vsc
 ; CHECK-LABEL: vwadd_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwadd.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwadd.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = sext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -331,8 +336,9 @@ define <vscale x 8 x i64> @vwaddu_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <vs
 ; CHECK-LABEL: vwaddu_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwaddu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -344,8 +350,8 @@ define <vscale x 8 x i64> @vwadd_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i32
 ; CHECK-LABEL: vwadd_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwadd.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwadd.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -359,8 +365,8 @@ define <vscale x 8 x i64> @vwaddu_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i32
 ; CHECK-LABEL: vwaddu_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwaddu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -744,10 +750,10 @@ define <vscale x 4 x i64> @vwadd_wx_nxv4i64_nxv4i16(<vscale x 4 x i64> %va, i16
 ; CHECK-LABEL: vwadd_wx_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v14, v12
-; CHECK-NEXT:    vwadd.wv v8, v8, v14
+; CHECK-NEXT:    vsext.vf2 v12, v14
+; CHECK-NEXT:    vwadd.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -760,10 +766,10 @@ define <vscale x 4 x i64> @vwaddu_wx_nxv4i64_nxv4i16(<vscale x 4 x i64> %va, i16
 ; CHECK-LABEL: vwaddu_wx_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v14, v12
-; CHECK-NEXT:    vwaddu.wv v8, v8, v14
+; CHECK-NEXT:    vzext.vf2 v12, v14
+; CHECK-NEXT:    vwaddu.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -862,10 +868,10 @@ define <vscale x 8 x i64> @vwadd_wx_nxv8i64_nxv8i16(<vscale x 8 x i64> %va, i16
 ; CHECK-LABEL: vwadd_wx_nxv8i64_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v20, v16
-; CHECK-NEXT:    vwadd.wv v8, v8, v20
+; CHECK-NEXT:    vsext.vf2 v16, v20
+; CHECK-NEXT:    vwadd.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -878,10 +884,10 @@ define <vscale x 8 x i64> @vwaddu_wx_nxv8i64_nxv8i16(<vscale x 8 x i64> %va, i16
 ; CHECK-LABEL: vwaddu_wx_nxv8i64_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v20, v16
-; CHECK-NEXT:    vwaddu.wv v8, v8, v20
+; CHECK-NEXT:    vzext.vf2 v16, v20
+; CHECK-NEXT:    vwaddu.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -1216,10 +1222,10 @@ define <vscale x 4 x i64> @vwadd_wx_nxv4i64_nxv4i8(<vscale x 4 x i64> %va, i8 %b
 ; CHECK-LABEL: vwadd_wx_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v14, v12
-; CHECK-NEXT:    vwadd.wv v8, v8, v14
+; CHECK-NEXT:    vsext.vf4 v12, v14
+; CHECK-NEXT:    vwadd.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -1232,10 +1238,10 @@ define <vscale x 4 x i64> @vwaddu_wx_nxv4i64_nxv4i8(<vscale x 4 x i64> %va, i8 %
 ; CHECK-LABEL: vwaddu_wx_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v14, v12
-; CHECK-NEXT:    vwaddu.wv v8, v8, v14
+; CHECK-NEXT:    vzext.vf4 v12, v14
+; CHECK-NEXT:    vwaddu.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -1334,10 +1340,10 @@ define <vscale x 8 x i64> @vwadd_wx_nxv8i64_nxv8i8(<vscale x 8 x i64> %va, i8 %b
 ; CHECK-LABEL: vwadd_wx_nxv8i64_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v20, v16
-; CHECK-NEXT:    vwadd.wv v8, v8, v20
+; CHECK-NEXT:    vsext.vf4 v16, v20
+; CHECK-NEXT:    vwadd.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1350,10 +1356,10 @@ define <vscale x 8 x i64> @vwaddu_wx_nxv8i64_nxv8i8(<vscale x 8 x i64> %va, i8 %
 ; CHECK-LABEL: vwaddu_wx_nxv8i64_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v20, v16
-; CHECK-NEXT:    vwaddu.wv v8, v8, v20
+; CHECK-NEXT:    vzext.vf4 v16, v20
+; CHECK-NEXT:    vwaddu.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1484,8 +1490,8 @@ define <vscale x 8 x i64> @vwadd_vx_splat_zext(<vscale x 8 x i32> %va, i32 %b) {
 ; RV64-LABEL: vwadd_vx_splat_zext:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64-NEXT:    vwaddu.vx v16, v8, a0
-; RV64-NEXT:    vmv8r.v v8, v16
+; RV64-NEXT:    vmv4r.v v16, v8
+; RV64-NEXT:    vwaddu.vx v8, v16, a0
 ; RV64-NEXT:    ret
   %zb = zext i32 %b to i64
   %head = insertelement <vscale x 8 x i64> poison, i64 %zb, i32 0
@@ -1559,8 +1565,8 @@ define <vscale x 8 x i64> @vwadd_vx_splat_sext(<vscale x 8 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vwadd_vx_splat_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwadd.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwadd.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %sb = sext i32 %b to i64
   %head = insertelement <vscale x 8 x i64> poison, i64 %sb, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd.ll
index c7df0378b3e34..a0ff37b600043 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwadd_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwadd.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwadd.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwadd.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwadd_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwadd.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwadd.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwadd.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwadd_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwadd.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwadd.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwadd.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwadd_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwadd.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwadd.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwadd.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwadd_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwadd.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwadd.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwadd.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwadd_vv_nxv16i32_nxv16i16_nxv16i16(<vscal
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwadd.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwadd.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwadd.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwadd_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwadd.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwadd.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwadd.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwadd_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwadd.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwadd.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwadd.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwadd_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwadd_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwadd.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwadd.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwadd.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwadd_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x i8
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwadd.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwadd.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwadd.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwadd_vx_nxv16i16_nxv16i8_i8(<vscale x 16
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwadd.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwadd.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwadd.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwadd_vx_nxv32i16_nxv32i8_i8(<vscale x 32
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwadd.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwadd.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwadd.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwadd_vx_nxv4i32_nxv4i16_i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwadd.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwadd.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwadd.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwadd_vx_nxv8i32_nxv8i16_i16(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwadd.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwadd.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwadd.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwadd_vx_nxv16i32_nxv16i16_i16(<vscale x 1
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwadd.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwadd.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwadd.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwadd_vx_nxv2i64_nxv2i32_i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwadd.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwadd.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwadd.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwadd_vx_nxv4i64_nxv4i32_i32(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwadd.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwadd.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwadd_vx_nxv8i64_nxv8i32_i32(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwadd_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwadd.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwadd.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwadd.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd.w.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd.w.ll
index b1309f5291877..3850261e2117a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd.w.ll
@@ -1986,8 +1986,8 @@ define <vscale x 8 x i16> @intrinsic_vwadd.w_wv_untie_nxv8i16_nxv8i16_nxv8i8(<vs
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv8i16_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwadd.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwadd.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwadd.w.nxv8i16.nxv8i8(
@@ -2003,8 +2003,8 @@ define <vscale x 16 x i16> @intrinsic_vwadd.w_wv_untie_nxv16i16_nxv16i16_nxv16i8
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv16i16_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwadd.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwadd.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwadd.w.nxv16i16.nxv16i8(
@@ -2020,8 +2020,8 @@ define <vscale x 32 x i16> @intrinsic_vwadd.w_wv_untie_nxv32i16_nxv32i16_nxv32i8
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv32i16_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwadd.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwadd.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwadd.w.nxv32i16.nxv32i8(
@@ -2071,8 +2071,8 @@ define <vscale x 4 x i32> @intrinsic_vwadd.w_wv_untie_nxv4i32_nxv4i32_nxv4i16(<v
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv4i32_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwadd.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwadd.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwadd.w.nxv4i32.nxv4i16(
@@ -2088,8 +2088,8 @@ define <vscale x 8 x i32> @intrinsic_vwadd.w_wv_untie_nxv8i32_nxv8i32_nxv8i16(<v
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv8i32_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwadd.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwadd.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwadd.w.nxv8i32.nxv8i16(
@@ -2122,8 +2122,8 @@ define <vscale x 2 x i64> @intrinsic_vwadd.w_wv_untie_nxv2i64_nxv2i64_nxv2i32(<v
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv2i64_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwadd.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwadd.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwadd.w.nxv2i64.nxv2i32(
@@ -2139,8 +2139,8 @@ define <vscale x 4 x i64> @intrinsic_vwadd.w_wv_untie_nxv4i64_nxv4i64_nxv4i32(<v
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv4i64_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwadd.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwadd.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwadd.w.nxv4i64.nxv4i32(
@@ -2156,8 +2156,8 @@ define <vscale x 8 x i64> @intrinsic_vwadd.w_wv_untie_nxv8i64_nxv8i64_nxv8i32(<v
 ; CHECK-LABEL: intrinsic_vwadd.w_wv_untie_nxv8i64_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwadd.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwadd.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwadd.w.nxv8i64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/vwaddu.ll
index 46932a13d22f0..4f82844c0edfc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwaddu.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwaddu_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwaddu.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwaddu_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwaddu.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwaddu_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwaddu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwaddu.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwaddu_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwaddu.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwaddu_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwaddu.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwaddu_vv_nxv16i32_nxv16i16_nxv16i16(<vsca
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwaddu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwaddu.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwaddu_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwaddu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwaddu.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwaddu_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwaddu.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwaddu_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwaddu_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwaddu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwaddu.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwaddu_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwaddu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwaddu_vx_nxv16i16_nxv16i8_i8(<vscale x 16
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwaddu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwaddu_vx_nxv32i16_nxv32i8_i8(<vscale x 32
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwaddu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwaddu_vx_nxv4i32_nxv4i16_i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwaddu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwaddu_vx_nxv8i32_nxv8i16_i16(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwaddu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwaddu_vx_nxv16i32_nxv16i16_i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwaddu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwaddu_vx_nxv2i64_nxv2i32_i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwaddu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwaddu_vx_nxv4i64_nxv4i32_i32(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwaddu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwaddu_vx_nxv8i64_nxv8i32_i32(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwaddu_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwaddu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwaddu.w.ll b/llvm/test/CodeGen/RISCV/rvv/vwaddu.w.ll
index 5a9a072129d31..8af403063f80a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwaddu.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwaddu.w.ll
@@ -1982,8 +1982,8 @@ define <vscale x 8 x i16> @intrinsic_vwaddu.w_wv_untie_nxv8i16_nxv8i16_nxv8i8(<v
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv8i16_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwaddu.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwaddu.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwaddu.w.nxv8i16.nxv8i8(
@@ -1999,8 +1999,8 @@ define <vscale x 16 x i16> @intrinsic_vwaddu.w_wv_untie_nxv16i16_nxv16i16_nxv16i
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv16i16_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwaddu.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwaddu.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwaddu.w.nxv16i16.nxv16i8(
@@ -2016,8 +2016,8 @@ define <vscale x 32 x i16> @intrinsic_vwaddu.w_wv_untie_nxv32i16_nxv32i16_nxv32i
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv32i16_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwaddu.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwaddu.w.nxv32i16.nxv32i8(
@@ -2067,8 +2067,8 @@ define <vscale x 4 x i32> @intrinsic_vwaddu.w_wv_untie_nxv4i32_nxv4i32_nxv4i16(<
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv4i32_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwaddu.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwaddu.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwaddu.w.nxv4i32.nxv4i16(
@@ -2084,8 +2084,8 @@ define <vscale x 8 x i32> @intrinsic_vwaddu.w_wv_untie_nxv8i32_nxv8i32_nxv8i16(<
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv8i32_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwaddu.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwaddu.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwaddu.w.nxv8i32.nxv8i16(
@@ -2118,8 +2118,8 @@ define <vscale x 2 x i64> @intrinsic_vwaddu.w_wv_untie_nxv2i64_nxv2i64_nxv2i32(<
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv2i64_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwaddu.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwaddu.w.nxv2i64.nxv2i32(
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vwaddu.w_wv_untie_nxv4i64_nxv4i64_nxv4i32(<
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv4i64_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwaddu.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwaddu.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwaddu.w.nxv4i64.nxv4i32(
@@ -2152,8 +2152,8 @@ define <vscale x 8 x i64> @intrinsic_vwaddu.w_wv_untie_nxv8i64_nxv8i64_nxv8i32(<
 ; CHECK-LABEL: intrinsic_vwaddu.w_wv_untie_nxv8i64_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwaddu.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwaddu.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwaddu.w.nxv8i64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwmul-sdnode.ll
index 28fc53f37ba1d..b55a7f610aad5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwmul-sdnode.ll
@@ -90,8 +90,9 @@ define <vscale x 2 x i64> @vwmul_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <vsc
 ; CHECK-LABEL: vwmul_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmul.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmul.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = sext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -103,8 +104,9 @@ define <vscale x 2 x i64> @vwmulu_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <vs
 ; CHECK-LABEL: vwmulu_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -116,8 +118,9 @@ define <vscale x 2 x i64> @vwmulsu_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK-LABEL: vwmulsu_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -129,8 +132,8 @@ define <vscale x 2 x i64> @vwmul_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i32
 ; CHECK-LABEL: vwmul_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmul.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmul.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -144,8 +147,8 @@ define <vscale x 2 x i64> @vwmulu_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i32
 ; CHECK-LABEL: vwmulu_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -159,8 +162,8 @@ define <vscale x 2 x i64> @vwmulsu_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i3
 ; CHECK-LABEL: vwmulsu_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -174,8 +177,9 @@ define <vscale x 4 x i64> @vwmul_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <vsc
 ; CHECK-LABEL: vwmul_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwmul.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmul.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = sext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -187,8 +191,9 @@ define <vscale x 4 x i64> @vwmulu_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <vs
 ; CHECK-LABEL: vwmulu_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -200,8 +205,9 @@ define <vscale x 4 x i64> @vwmulsu_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <v
 ; CHECK-LABEL: vwmulsu_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -213,8 +219,8 @@ define <vscale x 4 x i64> @vwmul_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i32
 ; CHECK-LABEL: vwmul_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwmul.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmul.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -228,8 +234,8 @@ define <vscale x 4 x i64> @vwmulu_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i32
 ; CHECK-LABEL: vwmulu_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -243,8 +249,8 @@ define <vscale x 4 x i64> @vwmulsu_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i3
 ; CHECK-LABEL: vwmulsu_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -258,8 +264,9 @@ define <vscale x 8 x i64> @vwmul_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <vsc
 ; CHECK-LABEL: vwmul_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwmul.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmul.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = sext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -271,8 +278,9 @@ define <vscale x 8 x i64> @vwmulu_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <vs
 ; CHECK-LABEL: vwmulu_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -284,8 +292,9 @@ define <vscale x 8 x i64> @vwmulsu_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <v
 ; CHECK-LABEL: vwmulsu_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -297,8 +306,8 @@ define <vscale x 8 x i64> @vwmul_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i32
 ; CHECK-LABEL: vwmul_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwmul.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmul.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
@@ -312,8 +321,8 @@ define <vscale x 8 x i64> @vwmulu_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i32
 ; CHECK-LABEL: vwmulu_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
@@ -327,8 +336,8 @@ define <vscale x 8 x i64> @vwmulsu_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i3
 ; CHECK-LABEL: vwmulsu_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/vwmul.ll
index f88765dc1e252..3c8b6c8c4b561 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwmul.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwmul_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwmul.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmul.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwmul.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwmul_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwmul.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmul.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwmul.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwmul_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwmul.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmul.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwmul.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwmul_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwmul.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmul.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwmul_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwmul.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmul.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwmul.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwmul_vv_nxv16i32_nxv16i16_nxv16i16(<vscal
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwmul.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmul.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwmul.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwmul_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwmul.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmul.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwmul.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwmul_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwmul.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmul.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwmul_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwmul_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwmul.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmul.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwmul.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwmul_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x i8
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwmul.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmul.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwmul.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwmul_vx_nxv16i16_nxv16i8_i8(<vscale x 16
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwmul.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmul.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwmul.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwmul_vx_nxv32i16_nxv32i8_i8(<vscale x 32
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwmul.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmul.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwmul.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwmul_vx_nxv4i32_nxv4i16_i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwmul.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmul.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwmul_vx_nxv8i32_nxv8i16_i16(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwmul.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmul.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwmul.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwmul_vx_nxv16i32_nxv16i16_i16(<vscale x 1
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwmul.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmul.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwmul.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwmul_vx_nxv2i64_nxv2i32_i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwmul.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmul.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwmul.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwmul_vx_nxv4i64_nxv4i32_i32(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwmul.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmul.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwmul_vx_nxv8i64_nxv8i32_i32(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwmul_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwmul.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmul.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwmul.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/vwmulsu.ll
index a44be068b0a65..fa0f935513493 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwmulsu.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwmulsu_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwmulsu.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwmulsu_vv_nxv16i16_nxv16i8_nxv16i8(<vscal
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwmulsu.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwmulsu_vv_nxv32i16_nxv32i8_nxv32i8(<vscal
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwmulsu.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwmulsu_vv_nxv4i32_nxv4i16_nxv4i16(<vscale
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwmulsu.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwmulsu_vv_nxv8i32_nxv8i16_nxv8i16(<vscale
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwmulsu.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwmulsu_vv_nxv16i32_nxv16i16_nxv16i16(<vsc
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwmulsu.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwmulsu_vv_nxv2i64_nxv2i32_nxv2i32(<vscale
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwmulsu.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwmulsu_vv_nxv4i64_nxv4i32_nxv4i32(<vscale
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwmulsu_vv_nxv8i64_nxv8i32_nxv8i32(<vscale
 ; CHECK-LABEL: intrinsic_vwmulsu_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulsu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwmulsu.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwmulsu_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwmulsu.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwmulsu_vx_nxv16i16_nxv16i8_i8(<vscale x 1
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwmulsu.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwmulsu_vx_nxv32i16_nxv32i8_i8(<vscale x 3
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwmulsu.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwmulsu_vx_nxv4i32_nxv4i16_i16(<vscale x 4
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwmulsu.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwmulsu_vx_nxv8i32_nxv8i16_i16(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwmulsu.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwmulsu_vx_nxv16i32_nxv16i16_i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwmulsu.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwmulsu_vx_nxv2i64_nxv2i32_i32(<vscale x 2
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwmulsu.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwmulsu_vx_nxv4i64_nxv4i32_i32(<vscale x 4
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwmulsu_vx_nxv8i64_nxv8i32_i32(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwmulsu_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulsu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulsu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwmulsu.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/vwmulu.ll
index c1281ec4d1be2..8c9f245921b66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwmulu.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwmulu_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwmulu.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwmulu_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwmulu.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwmulu_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwmulu.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwmulu_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwmulu.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwmulu_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwmulu.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwmulu_vv_nxv16i32_nxv16i16_nxv16i16(<vsca
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwmulu.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwmulu_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwmulu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwmulu.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwmulu_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwmulu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwmulu_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulu_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwmulu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwmulu.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwmulu_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwmulu.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwmulu_vx_nxv16i16_nxv16i8_i8(<vscale x 16
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwmulu.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwmulu_vx_nxv32i16_nxv32i8_i8(<vscale x 32
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwmulu.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwmulu_vx_nxv4i32_nxv4i16_i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwmulu.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwmulu_vx_nxv8i32_nxv8i16_i16(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwmulu.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwmulu_vx_nxv16i32_nxv16i16_i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwmulu.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwmulu_vx_nxv2i64_nxv2i32_i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwmulu.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwmulu_vx_nxv4i64_nxv4i32_i32(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwmulu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwmulu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwmulu_vx_nxv8i64_nxv8i32_i32(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwmulu_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwmulu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwmulu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwmulu.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index fd09fe791b4fd..381f57e59aa76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -20,8 +20,9 @@ define <vscale x 2 x i64> @vwsll_vv_nxv2i64_sext(<vscale x 2 x i32> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %y = sext <vscale x 2 x i32> %b to <vscale x 2 x i64>
@@ -41,8 +42,9 @@ define <vscale x 2 x i64> @vwsll_vv_nxv2i64_zext(<vscale x 2 x i32> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %y = zext <vscale x 2 x i32> %b to <vscale x 2 x i64>
@@ -61,8 +63,8 @@ define <vscale x 2 x i64> @vwsll_vx_i64_nxv2i64(<vscale x 2 x i32> %a, i64 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -75,18 +77,18 @@ define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_sext(<vscale x 2 x i32> %a, i32
 ; CHECK-LABEL: vwsll_vx_i32_nxv2i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -100,18 +102,18 @@ define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_zext(<vscale x 2 x i32> %a, i32
 ; CHECK-LABEL: vwsll_vx_i32_nxv2i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -125,18 +127,18 @@ define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_sext(<vscale x 2 x i32> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv2i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -150,18 +152,18 @@ define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_zext(<vscale x 2 x i32> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv2i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -175,18 +177,18 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_sext(<vscale x 2 x i32> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -200,18 +202,18 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -224,17 +226,17 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b
 define <vscale x 2 x i64> @vwsll_vi_nxv2i64(<vscale x 2 x i32> %a) {
 ; CHECK-LABEL: vwsll_vi_nxv2i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    li a0, 4
-; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %z = shl <vscale x 2 x i64> %x, splat (i64 2)
@@ -257,8 +259,9 @@ define <vscale x 4 x i32> @vwsll_vv_nxv4i32_sext(<vscale x 4 x i16> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv4i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %y = sext <vscale x 4 x i16> %b to <vscale x 4 x i32>
@@ -278,8 +281,9 @@ define <vscale x 4 x i32> @vwsll_vv_nxv4i32_zext(<vscale x 4 x i16> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv4i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %y = zext <vscale x 4 x i16> %b to <vscale x 4 x i32>
@@ -298,8 +302,8 @@ define <vscale x 4 x i32> @vwsll_vx_i64_nxv4i32(<vscale x 4 x i16> %a, i64 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -320,8 +324,8 @@ define <vscale x 4 x i32> @vwsll_vx_i32_nxv4i32(<vscale x 4 x i16> %a, i32 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -334,18 +338,18 @@ define <vscale x 4 x i32> @vwsll_vx_i16_nxv4i32_sext(<vscale x 4 x i16> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv4i32_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv4i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -359,18 +363,18 @@ define <vscale x 4 x i32> @vwsll_vx_i16_nxv4i32_zext(<vscale x 4 x i16> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv4i32_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv4i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -384,18 +388,18 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_sext(<vscale x 4 x i16> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv4i32_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv4i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -409,18 +413,18 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_zext(<vscale x 4 x i16> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv4i32_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv4i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -433,17 +437,17 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_zext(<vscale x 4 x i16> %a, i8 %b
 define <vscale x 4 x i32> @vwsll_vi_nxv4i32(<vscale x 4 x i16> %a) {
 ; CHECK-LABEL: vwsll_vi_nxv4i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    li a0, 4
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %z = shl <vscale x 4 x i32> %x, splat (i32 2)
@@ -466,8 +470,9 @@ define <vscale x 8 x i16> @vwsll_vv_nxv8i16_sext(<vscale x 8 x i8> %a, <vscale x
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv8i16_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %y = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -487,8 +492,9 @@ define <vscale x 8 x i16> @vwsll_vv_nxv8i16_zext(<vscale x 8 x i8> %a, <vscale x
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv8i16_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %y = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -507,8 +513,8 @@ define <vscale x 8 x i16> @vwsll_vx_i64_nxv8i16(<vscale x 8 x i8> %a, i64 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -529,8 +535,8 @@ define <vscale x 8 x i16> @vwsll_vx_i32_nxv8i16(<vscale x 8 x i8> %a, i32 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -551,8 +557,8 @@ define <vscale x 8 x i16> @vwsll_vx_i16_nxv8i16(<vscale x 8 x i8> %a, i16 %b) {
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -565,18 +571,18 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_sext(<vscale x 8 x i8> %a, i8 %b)
 ; CHECK-LABEL: vwsll_vx_i8_nxv8i16_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv8i16_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -590,18 +596,18 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_zext(<vscale x 8 x i8> %a, i8 %b)
 ; CHECK-LABEL: vwsll_vx_i8_nxv8i16_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv8i16_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -614,17 +620,17 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_zext(<vscale x 8 x i8> %a, i8 %b)
 define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a) {
 ; CHECK-LABEL: vwsll_vi_nxv8i16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    li a0, 4
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %z = shl <vscale x 8 x i16> %x, splat (i16 2)
@@ -704,21 +710,21 @@ define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a
 ; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT:    vmv.v.x v12, a0
 ; CHECK-ZVBB-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vzext.vf8 v10, v8
-; CHECK-ZVBB-NEXT:    vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT:    vsext.vf2 v8, v12
+; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -732,11 +738,11 @@ define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a
 ; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
@@ -757,21 +763,21 @@ define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a
 ; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT:    vmv.v.x v12, a0
 ; CHECK-ZVBB-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vzext.vf8 v10, v8
-; CHECK-ZVBB-NEXT:    vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT:    vsext.vf4 v8, v12
+; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -785,11 +791,11 @@ define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a
 ; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
@@ -810,21 +816,21 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a,
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vsext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vsext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT:    vmv.v.x v12, a0
 ; CHECK-ZVBB-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vzext.vf8 v10, v8
-; CHECK-ZVBB-NEXT:    vsext.vf8 v12, v9
-; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT:    vsext.vf8 v8, v12
+; CHECK-ZVBB-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -838,11 +844,11 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a,
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vzext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12
+; CHECK-NEXT:    vzext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
index 1358a7c69cb8a..87bb5fa0238ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
@@ -22,8 +22,9 @@ define <vscale x 2 x i64> @vwsll_vv_nxv2i64_sext(<vscale x 2 x i32> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %y = sext <vscale x 2 x i32> %b to <vscale x 2 x i64>
@@ -43,8 +44,9 @@ define <vscale x 2 x i64> @vwsll_vv_nxv2i64_zext(<vscale x 2 x i32> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %y = zext <vscale x 2 x i32> %b to <vscale x 2 x i64>
@@ -70,15 +72,15 @@ define <vscale x 2 x i64> @vwsll_vx_i64_nxv2i64(<vscale x 2 x i32> %a, i64 %b, <
 ; CHECK-ZVBB32-LABEL: vwsll_vx_i64_nxv2i64:
 ; CHECK-ZVBB32:       # %bb.0:
 ; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB32-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB32-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB32-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB32-NEXT:    ret
 ;
 ; CHECK-ZVBB64-LABEL: vwsll_vx_i64_nxv2i64:
 ; CHECK-ZVBB64:       # %bb.0:
 ; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB64-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB64-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB64-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB64-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -91,18 +93,18 @@ define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_sext(<vscale x 2 x i32> %a, i32
 ; CHECK-LABEL: vwsll_vx_i32_nxv2i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -116,18 +118,18 @@ define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_zext(<vscale x 2 x i32> %a, i32
 ; CHECK-LABEL: vwsll_vx_i32_nxv2i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -141,18 +143,18 @@ define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_sext(<vscale x 2 x i32> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv2i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -166,18 +168,18 @@ define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_zext(<vscale x 2 x i32> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv2i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -191,18 +193,18 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_sext(<vscale x 2 x i32> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vsext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -216,18 +218,18 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf8 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vzext.vf8 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -240,17 +242,17 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b
 define <vscale x 2 x i64> @vwsll_vi_nxv2i64(<vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vwsll_vi_nxv2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a1, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %z = call <vscale x 2 x i64> @llvm.vp.shl.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 2), <vscale x 2 x i1> %m, i32 %vl)
@@ -275,8 +277,9 @@ define <vscale x 4 x i32> @vwsll_vv_nxv4i32_sext(<vscale x 4 x i16> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv4i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %y = sext <vscale x 4 x i16> %b to <vscale x 4 x i32>
@@ -296,8 +299,9 @@ define <vscale x 4 x i32> @vwsll_vv_nxv4i32_zext(<vscale x 4 x i16> %a, <vscale
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv4i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %y = zext <vscale x 4 x i16> %b to <vscale x 4 x i32>
@@ -323,15 +327,15 @@ define <vscale x 4 x i32> @vwsll_vx_i64_nxv4i32(<vscale x 4 x i16> %a, i64 %b, <
 ; CHECK-ZVBB32-LABEL: vwsll_vx_i64_nxv4i32:
 ; CHECK-ZVBB32:       # %bb.0:
 ; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB32-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB32-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB32-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB32-NEXT:    ret
 ;
 ; CHECK-ZVBB64-LABEL: vwsll_vx_i64_nxv4i32:
 ; CHECK-ZVBB64:       # %bb.0:
 ; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB64-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB64-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB64-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB64-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -352,8 +356,8 @@ define <vscale x 4 x i32> @vwsll_vx_i32_nxv4i32(<vscale x 4 x i16> %a, i32 %b, <
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -366,18 +370,18 @@ define <vscale x 4 x i32> @vwsll_vx_i16_nxv4i32_sext(<vscale x 4 x i16> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv4i32_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv4i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -391,18 +395,18 @@ define <vscale x 4 x i32> @vwsll_vx_i16_nxv4i32_zext(<vscale x 4 x i16> %a, i16
 ; CHECK-LABEL: vwsll_vx_i16_nxv4i32_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv4i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -416,18 +420,18 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_sext(<vscale x 4 x i16> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv4i32_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vsext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv4i32_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -441,18 +445,18 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_zext(<vscale x 4 x i16> %a, i8 %b
 ; CHECK-LABEL: vwsll_vx_i8_nxv4i32_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf4 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv4i32_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -465,17 +469,17 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_zext(<vscale x 4 x i16> %a, i8 %b
 define <vscale x 4 x i32> @vwsll_vi_nxv4i32(<vscale x 4 x i16> %a, <vscale x 4 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vwsll_vi_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a1, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %z = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> splat (i32 2), <vscale x 4 x i1> %m, i32 %vl)
@@ -501,8 +505,9 @@ define <vscale x 8 x i16> @vwsll_vv_nxv8i16_sext(<vscale x 8 x i8> %a, <vscale x
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv8i16_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %y = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -522,8 +527,9 @@ define <vscale x 8 x i16> @vwsll_vv_nxv8i16_zext(<vscale x 8 x i8> %a, <vscale x
 ; CHECK-ZVBB-LABEL: vwsll_vv_nxv8i16_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vv v10, v8, v9, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v9
+; CHECK-ZVBB-NEXT:    vmv1r.v v11, v8
+; CHECK-ZVBB-NEXT:    vwsll.vv v8, v11, v10, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %y = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -549,15 +555,15 @@ define <vscale x 8 x i16> @vwsll_vx_i64_nxv8i16(<vscale x 8 x i8> %a, i64 %b, <v
 ; CHECK-ZVBB32-LABEL: vwsll_vx_i64_nxv8i16:
 ; CHECK-ZVBB32:       # %bb.0:
 ; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB32-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB32-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB32-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB32-NEXT:    ret
 ;
 ; CHECK-ZVBB64-LABEL: vwsll_vx_i64_nxv8i16:
 ; CHECK-ZVBB64:       # %bb.0:
 ; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-ZVBB64-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB64-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB64-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB64-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -578,8 +584,8 @@ define <vscale x 8 x i16> @vwsll_vx_i32_nxv8i16(<vscale x 8 x i8> %a, i32 %b, <v
 ; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -600,8 +606,8 @@ define <vscale x 8 x i16> @vwsll_vx_i16_nxv8i16(<vscale x 8 x i8> %a, i16 %b, <v
 ; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -614,18 +620,18 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_sext(<vscale x 8 x i8> %a, i8 %b,
 ; CHECK-LABEL: vwsll_vx_i8_nxv8i16_sext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vsext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv8i16_sext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -639,18 +645,18 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_zext(<vscale x 8 x i8> %a, i8 %b,
 ; CHECK-LABEL: vwsll_vx_i8_nxv8i16_zext:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    vsll.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    vsll.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv8i16_zext:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vx v10, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vx v8, v10, a0, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -663,17 +669,17 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_zext(<vscale x 8 x i8> %a, i8 %b,
 define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a, <vscale x 8 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vwsll_vi_nxv8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vx v10, v8, a1, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vwmulu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vwsll_vi_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vwsll.vi v10, v8, 2, v0.t
-; CHECK-ZVBB-NEXT:    vmv2r.v v8, v10
+; CHECK-ZVBB-NEXT:    vmv1r.v v10, v8
+; CHECK-ZVBB-NEXT:    vwsll.vi v8, v10, 2, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %z = call <vscale x 8 x i16> @llvm.vp.shl.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> splat (i16 2), <vscale x 8 x i1> %m, i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll.ll
index 9711afb2aafbf..14af93408d3d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwsll_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsll.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwsll_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwsll.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsll.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsll.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwsll_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwsll.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsll.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsll.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwsll_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsll.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwsll_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwsll.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsll.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsll.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwsll_vv_nxv16i32_nxv16i16_nxv16i16(<vscal
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwsll.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsll.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsll.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwsll_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwsll.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsll.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsll.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwsll_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwsll.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsll.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsll.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwsll_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsll_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwsll.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsll.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsll.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwsll_vx_nxv8i16_nxv8i8(<vscale x 8 x i8> %
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsll.nxv8i16.nxv8i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwsll_vx_nxv16i16_nxv16i8(<vscale x 16 x i
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwsll.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsll.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsll.nxv16i16.nxv16i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwsll_vx_nxv32i16_nxv32i8(<vscale x 32 x i
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwsll.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsll.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsll.nxv32i16.nxv32i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwsll_vx_nxv4i32_nxv4i16(<vscale x 4 x i16>
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsll.nxv4i32.nxv4i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwsll_vx_nxv8i32_nxv8i16(<vscale x 8 x i16>
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwsll.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsll.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsll.nxv8i32.nxv8i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwsll_vx_nxv16i32_nxv16i16(<vscale x 16 x
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwsll.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsll.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsll.nxv16i32.nxv16i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwsll_vx_nxv2i64_nxv2i32(<vscale x 2 x i32>
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsll.nxv2i64.nxv2i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwsll_vx_nxv4i64_nxv4i32(<vscale x 4 x i32>
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwsll.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsll.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsll.nxv4i64.nxv4i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwsll_vx_nxv8i64_nxv8i32(<vscale x 8 x i32>
 ; CHECK-LABEL: intrinsic_vwsll_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwsll.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsll.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsll.nxv8i64.nxv8i32(
@@ -1550,8 +1559,8 @@ define <vscale x 8 x i16> @intrinsic_vwsll_vi_nxv8i16_nxv8i8(<vscale x 8 x i8> %
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwsll.vi v10, v8, 1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsll.vi v8, v10, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsll.nxv8i16.nxv8i8(
@@ -1584,8 +1593,8 @@ define <vscale x 16 x i16> @intrinsic_vwsll_vi_nxv16i16_nxv16i8(<vscale x 16 x i
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwsll.vi v12, v8, 1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsll.vi v8, v12, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsll.nxv16i16.nxv16i8(
@@ -1618,8 +1627,8 @@ define <vscale x 32 x i16> @intrinsic_vwsll_vi_nxv32i16_nxv32i8(<vscale x 32 x i
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwsll.vi v16, v8, 1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsll.vi v8, v16, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsll.nxv32i16.nxv32i8(
@@ -1720,8 +1729,8 @@ define <vscale x 4 x i32> @intrinsic_vwsll_vi_nxv4i32_nxv4i16(<vscale x 4 x i16>
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwsll.vi v10, v8, 1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsll.vi v8, v10, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsll.nxv4i32.nxv4i16(
@@ -1754,8 +1763,8 @@ define <vscale x 8 x i32> @intrinsic_vwsll_vi_nxv8i32_nxv8i16(<vscale x 8 x i16>
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwsll.vi v12, v8, 1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsll.vi v8, v12, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsll.nxv8i32.nxv8i16(
@@ -1788,8 +1797,8 @@ define <vscale x 16 x i32> @intrinsic_vwsll_vi_nxv16i32_nxv16i16(<vscale x 16 x
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwsll.vi v16, v8, 1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsll.vi v8, v16, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsll.nxv16i32.nxv16i16(
@@ -1856,8 +1865,8 @@ define <vscale x 2 x i64> @intrinsic_vwsll_vi_nxv2i64_nxv2i32(<vscale x 2 x i32>
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwsll.vi v10, v8, 1
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsll.vi v8, v10, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsll.nxv2i64.nxv2i32(
@@ -1890,8 +1899,8 @@ define <vscale x 4 x i64> @intrinsic_vwsll_vi_nxv4i64_nxv4i32(<vscale x 4 x i32>
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwsll.vi v12, v8, 1
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsll.vi v8, v12, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsll.nxv4i64.nxv4i32(
@@ -1924,8 +1933,8 @@ define <vscale x 8 x i64> @intrinsic_vwsll_vi_nxv8i64_nxv8i32(<vscale x 8 x i32>
 ; CHECK-LABEL: intrinsic_vwsll_vi_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwsll.vi v16, v8, 1
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsll.vi v8, v16, 1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsll.nxv8i64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
index 04ece9d94880c..22f9f97373415 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
@@ -5,12 +5,13 @@
 define <vscale x 8 x i64> @vwsub_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vwsub_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v24, v8
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
-; CHECK-NEXT:    vwsub.wv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
+; CHECK-NEXT:    vwsub.wv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
@@ -22,12 +23,13 @@ define <vscale x 8 x i64> @vwsub_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 define <vscale x 8 x i64> @vwsubu_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vwsubu_wv_mask_v8i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v24, v8
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
-; CHECK-NEXT:    vwsubu.wv v16, v16, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, tu, mu
+; CHECK-NEXT:    vwsubu.wv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
@@ -39,13 +41,13 @@ define <vscale x 8 x i64> @vwsubu_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 define <vscale x 8 x i64> @vwsubu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
 ; CHECK-LABEL: vwsubu_vv_mask_v8i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v12
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vwsubu.vv v16, v12, v8
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmerge.vvm v20, v12, v8, v0
+; CHECK-NEXT:    vwsubu.vv v8, v16, v20
 ; CHECK-NEXT:    ret
     %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
     %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub-sdnode.ll
index 852814d648bfc..631c8c3ca48f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsub-sdnode.ll
@@ -110,8 +110,9 @@ define <vscale x 2 x i64> @vwsub_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <vsc
 ; CHECK-LABEL: vwsub_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsub.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = sext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -123,8 +124,9 @@ define <vscale x 2 x i64> @vwsubu_vv_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, <vs
 ; CHECK-LABEL: vwsubu_vv_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 2 x i32> %va to <vscale x 2 x i64>
   %vd = zext <vscale x 2 x i32> %vb to <vscale x 2 x i64>
@@ -136,8 +138,8 @@ define <vscale x 2 x i64> @vwsub_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i32
 ; CHECK-LABEL: vwsub_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwsub.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -151,8 +153,8 @@ define <vscale x 2 x i64> @vwsubu_vx_nxv2i64_nxv2i32(<vscale x 2 x i32> %va, i32
 ; CHECK-LABEL: vwsubu_vx_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsubu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -214,8 +216,9 @@ define <vscale x 4 x i64> @vwsub_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <vsc
 ; CHECK-LABEL: vwsub_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwsub.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsub.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = sext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -227,8 +230,9 @@ define <vscale x 4 x i64> @vwsubu_vv_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, <vs
 ; CHECK-LABEL: vwsubu_vv_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsubu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vd = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
@@ -240,8 +244,8 @@ define <vscale x 4 x i64> @vwsub_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i32
 ; CHECK-LABEL: vwsub_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwsub.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -255,8 +259,8 @@ define <vscale x 4 x i64> @vwsubu_vx_nxv4i64_nxv4i32(<vscale x 4 x i32> %va, i32
 ; CHECK-LABEL: vwsubu_vx_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsubu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -318,8 +322,9 @@ define <vscale x 8 x i64> @vwsub_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <vsc
 ; CHECK-LABEL: vwsub_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwsub.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsub.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = sext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = sext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -331,8 +336,9 @@ define <vscale x 8 x i64> @vwsubu_vv_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, <vs
 ; CHECK-LABEL: vwsubu_vv_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsubu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
   %vc = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vd = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
@@ -344,8 +350,8 @@ define <vscale x 8 x i64> @vwsub_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i32
 ; CHECK-LABEL: vwsub_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwsub.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -359,8 +365,8 @@ define <vscale x 8 x i64> @vwsubu_vx_nxv8i64_nxv8i32(<vscale x 8 x i32> %va, i32
 ; CHECK-LABEL: vwsubu_vx_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsubu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -744,10 +750,10 @@ define <vscale x 4 x i64> @vwsub_wx_nxv4i64_nxv4i16(<vscale x 4 x i64> %va, i16
 ; CHECK-LABEL: vwsub_wx_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v14, v12
-; CHECK-NEXT:    vwsub.wv v8, v8, v14
+; CHECK-NEXT:    vsext.vf2 v12, v14
+; CHECK-NEXT:    vwsub.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -760,10 +766,10 @@ define <vscale x 4 x i64> @vwsubu_wx_nxv4i64_nxv4i16(<vscale x 4 x i64> %va, i16
 ; CHECK-LABEL: vwsubu_wx_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v14, v12
-; CHECK-NEXT:    vwsubu.wv v8, v8, v14
+; CHECK-NEXT:    vzext.vf2 v12, v14
+; CHECK-NEXT:    vwsubu.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -862,10 +868,10 @@ define <vscale x 8 x i64> @vwsub_wx_nxv8i64_nxv8i16(<vscale x 8 x i64> %va, i16
 ; CHECK-LABEL: vwsub_wx_nxv8i64_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v20, v16
-; CHECK-NEXT:    vwsub.wv v8, v8, v20
+; CHECK-NEXT:    vsext.vf2 v16, v20
+; CHECK-NEXT:    vwsub.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -878,10 +884,10 @@ define <vscale x 8 x i64> @vwsubu_wx_nxv8i64_nxv8i16(<vscale x 8 x i64> %va, i16
 ; CHECK-LABEL: vwsubu_wx_nxv8i64_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v20, v16
-; CHECK-NEXT:    vwsubu.wv v8, v8, v20
+; CHECK-NEXT:    vzext.vf2 v16, v20
+; CHECK-NEXT:    vwsubu.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 %b, i16 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -1216,10 +1222,10 @@ define <vscale x 4 x i64> @vwsub_wx_nxv4i64_nxv4i8(<vscale x 4 x i64> %va, i8 %b
 ; CHECK-LABEL: vwsub_wx_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsext.vf4 v14, v12
-; CHECK-NEXT:    vwsub.wv v8, v8, v14
+; CHECK-NEXT:    vsext.vf4 v12, v14
+; CHECK-NEXT:    vwsub.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -1232,10 +1238,10 @@ define <vscale x 4 x i64> @vwsubu_wx_nxv4i64_nxv4i8(<vscale x 4 x i64> %va, i8 %
 ; CHECK-LABEL: vwsubu_wx_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vmv.v.x v14, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v14, v12
-; CHECK-NEXT:    vwsubu.wv v8, v8, v14
+; CHECK-NEXT:    vzext.vf4 v12, v14
+; CHECK-NEXT:    vwsubu.wv v8, v8, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -1334,10 +1340,10 @@ define <vscale x 8 x i64> @vwsub_wx_nxv8i64_nxv8i8(<vscale x 8 x i64> %va, i8 %b
 ; CHECK-LABEL: vwsub_wx_nxv8i64_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v20, v16
-; CHECK-NEXT:    vwsub.wv v8, v8, v20
+; CHECK-NEXT:    vsext.vf4 v16, v20
+; CHECK-NEXT:    vwsub.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1350,10 +1356,10 @@ define <vscale x 8 x i64> @vwsubu_wx_nxv8i64_nxv8i8(<vscale x 8 x i64> %va, i8 %
 ; CHECK-LABEL: vwsubu_wx_nxv8i64_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v20, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v20, v16
-; CHECK-NEXT:    vwsubu.wv v8, v8, v20
+; CHECK-NEXT:    vzext.vf4 v16, v20
+; CHECK-NEXT:    vwsubu.wv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 %b, i8 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub.ll
index c22d86cdef1d7..24e03df413bfd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsub.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwsub_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsub.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsub.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwsub_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwsub.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsub.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsub.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwsub_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwsub.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsub.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsub.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwsub_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsub.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsub.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwsub_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwsub.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsub.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsub.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwsub_vv_nxv16i32_nxv16i16_nxv16i16(<vscal
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwsub.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsub.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsub.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwsub_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsub.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsub.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwsub_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwsub.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsub.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsub.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwsub_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsub_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwsub.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsub.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsub.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwsub_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x i8
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwsub.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsub.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwsub_vx_nxv16i16_nxv16i8_i8(<vscale x 16
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwsub.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsub.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwsub_vx_nxv32i16_nxv32i8_i8(<vscale x 32
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwsub.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsub.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwsub_vx_nxv4i32_nxv4i16_i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwsub.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsub.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwsub_vx_nxv8i32_nxv8i16_i16(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwsub.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsub.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwsub_vx_nxv16i32_nxv16i16_i16(<vscale x 1
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwsub.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsub.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwsub_vx_nxv2i64_nxv2i32_i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwsub.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsub.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwsub_vx_nxv4i64_nxv4i32_i32(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwsub.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsub.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwsub_vx_nxv8i64_nxv8i32_i32(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwsub_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwsub.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsub.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub.w.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub.w.ll
index 08ed452776f95..1bc6103d298b2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsub.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsub.w.ll
@@ -1982,8 +1982,8 @@ define <vscale x 8 x i16> @intrinsic_vwsub.w_wv_untie_nxv8i16_nxv8i16_nxv8i8(<vs
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv8i16_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwsub.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwsub.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsub.w.nxv8i16.nxv8i8(
@@ -1999,8 +1999,8 @@ define <vscale x 16 x i16> @intrinsic_vwsub.w_wv_untie_nxv16i16_nxv16i16_nxv16i8
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv16i16_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwsub.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwsub.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsub.w.nxv16i16.nxv16i8(
@@ -2016,8 +2016,8 @@ define <vscale x 32 x i16> @intrinsic_vwsub.w_wv_untie_nxv32i16_nxv32i16_nxv32i8
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv32i16_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwsub.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwsub.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsub.w.nxv32i16.nxv32i8(
@@ -2067,8 +2067,8 @@ define <vscale x 4 x i32> @intrinsic_vwsub.w_wv_untie_nxv4i32_nxv4i32_nxv4i16(<v
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv4i32_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwsub.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwsub.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsub.w.nxv4i32.nxv4i16(
@@ -2084,8 +2084,8 @@ define <vscale x 8 x i32> @intrinsic_vwsub.w_wv_untie_nxv8i32_nxv8i32_nxv8i16(<v
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv8i32_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwsub.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwsub.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsub.w.nxv8i32.nxv8i16(
@@ -2118,8 +2118,8 @@ define <vscale x 2 x i64> @intrinsic_vwsub.w_wv_untie_nxv2i64_nxv2i64_nxv2i32(<v
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv2i64_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwsub.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwsub.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsub.w.nxv2i64.nxv2i32(
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vwsub.w_wv_untie_nxv4i64_nxv4i64_nxv4i32(<v
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv4i64_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwsub.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwsub.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsub.w.nxv4i64.nxv4i32(
@@ -2152,8 +2152,8 @@ define <vscale x 8 x i64> @intrinsic_vwsub.w_wv_untie_nxv8i64_nxv8i64_nxv8i32(<v
 ; CHECK-LABEL: intrinsic_vwsub.w_wv_untie_nxv8i64_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwsub.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwsub.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsub.w.nxv8i64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/vwsubu.ll
index 858526d3f9b34..d849b0aaffc63 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsubu.ll
@@ -158,8 +158,9 @@ define <vscale x 8 x i16> @intrinsic_vwsubu_vv_nxv8i16_nxv8i8_nxv8i8(<vscale x 8
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv8i16_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsubu.nxv8i16.nxv8i8.nxv8i8(
@@ -206,8 +207,9 @@ define <vscale x 16 x i16> @intrinsic_vwsubu_vv_nxv16i16_nxv16i8_nxv16i8(<vscale
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv16i16_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsubu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsubu.nxv16i16.nxv16i8.nxv16i8(
@@ -254,8 +256,9 @@ define <vscale x 32 x i16> @intrinsic_vwsubu_vv_nxv32i16_nxv32i8_nxv32i8(<vscale
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv32i16_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsubu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsubu.nxv32i16.nxv32i8.nxv32i8(
@@ -398,8 +401,9 @@ define <vscale x 4 x i32> @intrinsic_vwsubu_vv_nxv4i32_nxv4i16_nxv4i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv4i32_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsubu.nxv4i32.nxv4i16.nxv4i16(
@@ -446,8 +450,9 @@ define <vscale x 8 x i32> @intrinsic_vwsubu_vv_nxv8i32_nxv8i16_nxv8i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv8i32_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsubu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsubu.nxv8i32.nxv8i16.nxv8i16(
@@ -494,8 +499,9 @@ define <vscale x 16 x i32> @intrinsic_vwsubu_vv_nxv16i32_nxv16i16_nxv16i16(<vsca
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv16i32_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsubu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsubu.nxv16i32.nxv16i16.nxv16i16(
@@ -590,8 +596,9 @@ define <vscale x 2 x i64> @intrinsic_vwsubu_vv_nxv2i64_nxv2i32_nxv2i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv2i64_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vv v10, v8, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsubu.nxv2i64.nxv2i32.nxv2i32(
@@ -638,8 +645,9 @@ define <vscale x 4 x i64> @intrinsic_vwsubu_vv_nxv4i64_nxv4i32_nxv4i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv4i64_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vv v12, v8, v10
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv2r.v v14, v8
+; CHECK-NEXT:    vwsubu.vv v8, v14, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsubu.nxv4i64.nxv4i32.nxv4i32(
@@ -686,8 +694,9 @@ define <vscale x 8 x i64> @intrinsic_vwsubu_vv_nxv8i64_nxv8i32_nxv8i32(<vscale x
 ; CHECK-LABEL: intrinsic_vwsubu_vv_nxv8i64_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vv v16, v8, v12
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
+; CHECK-NEXT:    vwsubu.vv v8, v20, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsubu.nxv8i64.nxv8i32.nxv8i32(
@@ -878,8 +887,8 @@ define <vscale x 8 x i16> @intrinsic_vwsubu_vx_nxv8i16_nxv8i8_i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv8i16_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsubu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsubu.nxv8i16.nxv8i8.i8(
@@ -926,8 +935,8 @@ define <vscale x 16 x i16> @intrinsic_vwsubu_vx_nxv16i16_nxv16i8_i8(<vscale x 16
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv16i16_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsubu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsubu.nxv16i16.nxv16i8.i8(
@@ -974,8 +983,8 @@ define <vscale x 32 x i16> @intrinsic_vwsubu_vx_nxv32i16_nxv32i8_i8(<vscale x 32
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv32i16_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsubu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsubu.nxv32i16.nxv32i8.i8(
@@ -1118,8 +1127,8 @@ define <vscale x 4 x i32> @intrinsic_vwsubu_vx_nxv4i32_nxv4i16_i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv4i32_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsubu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsubu.nxv4i32.nxv4i16.i16(
@@ -1166,8 +1175,8 @@ define <vscale x 8 x i32> @intrinsic_vwsubu_vx_nxv8i32_nxv8i16_i16(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv8i32_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsubu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsubu.nxv8i32.nxv8i16.i16(
@@ -1214,8 +1223,8 @@ define <vscale x 16 x i32> @intrinsic_vwsubu_vx_nxv16i32_nxv16i16_i16(<vscale x
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv16i32_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsubu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vwsubu.nxv16i32.nxv16i16.i16(
@@ -1310,8 +1319,8 @@ define <vscale x 2 x i64> @intrinsic_vwsubu_vx_nxv2i64_nxv2i32_i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv2i64_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vwsubu.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vwsubu.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsubu.nxv2i64.nxv2i32.i32(
@@ -1358,8 +1367,8 @@ define <vscale x 4 x i64> @intrinsic_vwsubu_vx_nxv4i64_nxv4i32_i32(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv4i64_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vwsubu.vx v12, v8, a0
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vwsubu.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsubu.nxv4i64.nxv4i32.i32(
@@ -1406,8 +1415,8 @@ define <vscale x 8 x i64> @intrinsic_vwsubu_vx_nxv8i64_nxv8i32_i32(<vscale x 8 x
 ; CHECK-LABEL: intrinsic_vwsubu_vx_nxv8i64_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vwsubu.vx v16, v8, a0
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vwsubu.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsubu.nxv8i64.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsubu.w.ll b/llvm/test/CodeGen/RISCV/rvv/vwsubu.w.ll
index 4676be60f64e1..01b3287e4ed31 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsubu.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsubu.w.ll
@@ -1982,8 +1982,8 @@ define <vscale x 8 x i16> @intrinsic_vwsubu.w_wv_untie_nxv8i16_nxv8i16_nxv8i8(<v
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv8i16_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vwsubu.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwsubu.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vwsubu.w.nxv8i16.nxv8i8(
@@ -1999,8 +1999,8 @@ define <vscale x 16 x i16> @intrinsic_vwsubu.w_wv_untie_nxv16i16_nxv16i16_nxv16i
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv16i16_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vwsubu.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwsubu.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vwsubu.w.nxv16i16.nxv16i8(
@@ -2016,8 +2016,8 @@ define <vscale x 32 x i16> @intrinsic_vwsubu.w_wv_untie_nxv32i16_nxv32i16_nxv32i
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv32i16_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vwsubu.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwsubu.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vwsubu.w.nxv32i16.nxv32i8(
@@ -2067,8 +2067,8 @@ define <vscale x 4 x i32> @intrinsic_vwsubu.w_wv_untie_nxv4i32_nxv4i32_nxv4i16(<
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv4i32_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vwsubu.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwsubu.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vwsubu.w.nxv4i32.nxv4i16(
@@ -2084,8 +2084,8 @@ define <vscale x 8 x i32> @intrinsic_vwsubu.w_wv_untie_nxv8i32_nxv8i32_nxv8i16(<
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv8i32_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vwsubu.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwsubu.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vwsubu.w.nxv8i32.nxv8i16(
@@ -2118,8 +2118,8 @@ define <vscale x 2 x i64> @intrinsic_vwsubu.w_wv_untie_nxv2i64_nxv2i64_nxv2i32(<
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv2i64_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vwsubu.wv v12, v10, v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vwsubu.wv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vwsubu.w.nxv2i64.nxv2i32(
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vwsubu.w_wv_untie_nxv4i64_nxv4i64_nxv4i32(<
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv4i64_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vwsubu.wv v16, v12, v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vwsubu.wv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vwsubu.w.nxv4i64.nxv4i32(
@@ -2152,8 +2152,8 @@ define <vscale x 8 x i64> @intrinsic_vwsubu.w_wv_untie_nxv8i64_nxv8i64_nxv8i32(<
 ; CHECK-LABEL: intrinsic_vwsubu.w_wv_untie_nxv8i64_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vwsubu.wv v24, v16, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vwsubu.wv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vwsubu.w.nxv8i64.nxv8i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 934d7eb43ac2a..34337b1af1df5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -56,8 +56,8 @@ define <vscale x 2 x i64> @vzext_nxv2i8_nxv2i64(<vscale x 2 x i8> %a, <vscale x
 ; CHECK-LABEL: vzext_nxv2i8_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf8 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -67,8 +67,8 @@ define <vscale x 2 x i64> @vzext_nxv2i8_nxv2i64_unmasked(<vscale x 2 x i8> %a, i
 ; CHECK-LABEL: vzext_nxv2i8_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf8 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -104,8 +104,8 @@ define <vscale x 2 x i64> @vzext_nxv2i16_nxv2i64(<vscale x 2 x i16> %a, <vscale
 ; CHECK-LABEL: vzext_nxv2i16_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -115,8 +115,8 @@ define <vscale x 2 x i64> @vzext_nxv2i16_nxv2i64_unmasked(<vscale x 2 x i16> %a,
 ; CHECK-LABEL: vzext_nxv2i16_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -128,8 +128,8 @@ define <vscale x 2 x i64> @vzext_nxv2i32_nxv2i64(<vscale x 2 x i32> %a, <vscale
 ; CHECK-LABEL: vzext_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i32 %vl)
   ret <vscale x 2 x i64> %v
@@ -139,8 +139,8 @@ define <vscale x 2 x i64> @vzext_nxv2i32_nxv2i64_unmasked(<vscale x 2 x i32> %a,
 ; CHECK-LABEL: vzext_nxv2i32_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
   ret <vscale x 2 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext.ll b/llvm/test/CodeGen/RISCV/rvv/vzext.ll
index bf246b2694436..a883041000a9a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext.ll
@@ -57,8 +57,8 @@ define <vscale x 2 x i64> @intrinsic_vzext_vf8_nxv2i64(<vscale x 2 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vzext_vf8_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf8 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf8 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vzext.nxv2i64.nxv2i8(
@@ -101,8 +101,8 @@ define <vscale x 4 x i64> @intrinsic_vzext_vf8_nxv4i64(<vscale x 4 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vzext_vf8_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf8 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf8 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vzext.nxv4i64.nxv4i8(
@@ -145,8 +145,8 @@ define <vscale x 8 x i64> @intrinsic_vzext_vf8_nxv8i64(<vscale x 8 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vzext_vf8_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf8 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vzext.vf8 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vzext.nxv8i64.nxv8i8(
@@ -233,8 +233,8 @@ define <vscale x 2 x i64> @intrinsic_vzext_vf4_nxv2i64(<vscale x 2 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf4_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vzext.nxv2i64.nxv2i16(
@@ -277,8 +277,8 @@ define <vscale x 4 x i64> @intrinsic_vzext_vf4_nxv4i64(<vscale x 4 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf4_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf4 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vzext.nxv4i64.nxv4i16(
@@ -321,8 +321,8 @@ define <vscale x 8 x i64> @intrinsic_vzext_vf4_nxv8i64(<vscale x 8 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf4_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vzext.vf4 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vzext.nxv8i64.nxv8i16(
@@ -453,8 +453,8 @@ define <vscale x 4 x i32> @intrinsic_vzext_vf4_nxv4i32(<vscale x 4 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vzext_vf4_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf4 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf4 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vzext.nxv4i32.nxv4i8(
@@ -497,8 +497,8 @@ define <vscale x 8 x i32> @intrinsic_vzext_vf4_nxv8i32(<vscale x 8 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vzext_vf4_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vzext.vf4 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vzext.nxv8i32.nxv8i8(
@@ -541,8 +541,8 @@ define <vscale x 16 x i32> @intrinsic_vzext_vf4_nxv16i32(<vscale x 16 x i8> %0,
 ; CHECK-LABEL: intrinsic_vzext_vf4_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vzext.vf4 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vzext.vf4 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vzext.nxv16i32.nxv16i8(
@@ -629,8 +629,8 @@ define <vscale x 2 x i64> @intrinsic_vzext_vf2_nxv2i64(<vscale x 2 x i32> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vzext.nxv2i64.nxv2i32(
@@ -673,8 +673,8 @@ define <vscale x 4 x i64> @intrinsic_vzext_vf2_nxv4i64(<vscale x 4 x i32> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vzext.nxv4i64.nxv4i32(
@@ -717,8 +717,8 @@ define <vscale x 8 x i64> @intrinsic_vzext_vf2_nxv8i64(<vscale x 8 x i32> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vzext.vf2 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vzext.nxv8i64.nxv8i32(
@@ -849,8 +849,8 @@ define <vscale x 4 x i32> @intrinsic_vzext_vf2_nxv4i32(<vscale x 4 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vzext.nxv4i32.nxv4i16(
@@ -893,8 +893,8 @@ define <vscale x 8 x i32> @intrinsic_vzext_vf2_nxv8i32(<vscale x 8 x i16> %0, iX
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vzext.nxv8i32.nxv8i16(
@@ -937,8 +937,8 @@ define <vscale x 16 x i32> @intrinsic_vzext_vf2_nxv16i32(<vscale x 16 x i16> %0,
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vzext.vf2 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vzext.nxv16i32.nxv16i16(
@@ -1113,8 +1113,8 @@ define <vscale x 8 x i16> @intrinsic_vzext_vf2_nxv8i16(<vscale x 8 x i8> %0, iXL
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vzext.vf2 v8, v10
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vzext.nxv8i16.nxv8i8(
@@ -1157,8 +1157,8 @@ define <vscale x 16 x i16> @intrinsic_vzext_vf2_nxv16i16(<vscale x 16 x i8> %0,
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vzext.vf2 v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vzext.nxv16i16.nxv16i8(
@@ -1201,8 +1201,8 @@ define <vscale x 32 x i16> @intrinsic_vzext_vf2_nxv32i16(<vscale x 32 x i8> %0,
 ; CHECK-LABEL: intrinsic_vzext_vf2_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vzext.vf2 v8, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vzext.nxv32i16.nxv32i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvbb-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rvv/zvbb-demanded-bits.ll
index 47622a62d0eaf..107d59670c4c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvbb-demanded-bits.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/zvbb-demanded-bits.ll
@@ -33,10 +33,10 @@ define <vscale x 1 x i8> @vror_vx_nxv1i8(<vscale x 1 x i8> %a, i8 %b) {
 define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b) {
 ; CHECK-LABEL: vwsll_vx_i8_nxv2i64_zext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addiw a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vwsll.vx v10, v8, a0
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    addiw a0, a0, 1
+; CHECK-NEXT:    vwsll.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %s = add i8 %b, 1
   %head = insertelement <vscale x 2 x i8> poison, i8 %s, i32 0
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 42c87c9660dc9..daeb306b7e85f 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -667,17 +667,17 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    vsetivli zero, 3, e8, mf2, ta, ma
 ; RV32MV-NEXT:    vmv.v.i v10, 1
 ; RV32MV-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32MV-NEXT:    vmv.v.i v11, 0
+; RV32MV-NEXT:    vmv.v.i v12, 0
 ; RV32MV-NEXT:    vsetivli zero, 3, e8, mf2, tu, ma
-; RV32MV-NEXT:    vslideup.vi v11, v10, 2
+; RV32MV-NEXT:    vslideup.vi v12, v10, 2
 ; RV32MV-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
 ; RV32MV-NEXT:    vmv.v.i v10, 2
 ; RV32MV-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
-; RV32MV-NEXT:    vslideup.vi v11, v10, 4
+; RV32MV-NEXT:    vslideup.vi v12, v10, 4
 ; RV32MV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32MV-NEXT:    vsext.vf4 v12, v11
+; RV32MV-NEXT:    vsext.vf4 v10, v12
 ; RV32MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32MV-NEXT:    vmsne.vv v0, v8, v12
+; RV32MV-NEXT:    vmsne.vv v0, v8, v10
 ; RV32MV-NEXT:    vmv.v.i v8, 0
 ; RV32MV-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -735,7 +735,7 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64MV-NEXT:    ld a6, %lo(.LCPI3_2)(a6)
 ; RV64MV-NEXT:    addi a7, a7, 256
 ; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64MV-NEXT:    vmv.s.x v8, a7
+; RV64MV-NEXT:    vmv.s.x v10, a7
 ; RV64MV-NEXT:    slli a3, a3, 32
 ; RV64MV-NEXT:    srli a7, a1, 2
 ; RV64MV-NEXT:    or a3, a2, a3
@@ -766,16 +766,16 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64MV-NEXT:    add a1, a1, a5
 ; RV64MV-NEXT:    li a4, -1
 ; RV64MV-NEXT:    srli a4, a4, 31
-; RV64MV-NEXT:    vsext.vf8 v10, v8
+; RV64MV-NEXT:    vsext.vf8 v8, v10
 ; RV64MV-NEXT:    add a3, a3, a6
 ; RV64MV-NEXT:    slli a6, a6, 2
-; RV64MV-NEXT:    vmv.v.x v8, a1
+; RV64MV-NEXT:    vmv.v.x v10, a1
 ; RV64MV-NEXT:    add a3, a3, a6
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a2
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a3
-; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
-; RV64MV-NEXT:    vand.vx v8, v8, a4
-; RV64MV-NEXT:    vmsne.vv v0, v8, v10
+; RV64MV-NEXT:    vslide1down.vx v10, v10, a2
+; RV64MV-NEXT:    vslide1down.vx v10, v10, a3
+; RV64MV-NEXT:    vslidedown.vi v10, v10, 1
+; RV64MV-NEXT:    vand.vx v10, v10, a4
+; RV64MV-NEXT:    vmsne.vv v0, v10, v8
 ; RV64MV-NEXT:    vmv.v.i v8, 0
 ; RV64MV-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64MV-NEXT:    vslidedown.vi v10, v8, 2

>From 6cf0b43d9aec4741f1f1a2024a76090af2eb8d71 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 13 Mar 2025 16:51:15 -0700
Subject: [PATCH 2/2] Address review comment

---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td    |   9 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  16 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |  28 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    |  68 +--
 llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll    |  12 +-
 llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll   | 416 +++++++++---------
 llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll   | 416 +++++++++---------
 llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll   | 416 +++++++++---------
 llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll   | 416 +++++++++---------
 9 files changed, 897 insertions(+), 900 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 1e0541e667895..3e4d71db995a0 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -718,6 +718,9 @@ class VReg<list<ValueType> regTypes, dag regList, int Vlmul>
                          regList> {
   let IsVRegClass = 1;
   let VLMul = Vlmul;
+
+  // Prefer to allocate high LMUL registers first.
+  let AllocationPriority = !if(!gt(Vlmul, 1), Vlmul, 0);
 }
 
 defvar VMaskVTs = [vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t,
@@ -752,24 +755,18 @@ def VR : VReg<!listconcat(VM1VTs, VMaskVTs),
 
 def VRNoV0 : VReg<!listconcat(VM1VTs, VMaskVTs), (sub VR, V0), 1>;
 
-let AllocationPriority = 2 in
 def VRM2 : VReg<VM2VTs, (add (sequence "V%uM2", 8, 31, 2),
                              (sequence "V%uM2", 6, 0, 2)), 2>;
 
-let AllocationPriority = 2 in
 def VRM2NoV0 : VReg<VM2VTs, (sub VRM2, V0M2), 2>;
 
-let AllocationPriority = 4 in
 def VRM4 : VReg<VM4VTs, (add V8M4, V12M4, V16M4, V20M4,
                              V24M4, V28M4, V4M4, V0M4), 4>;
 
-let AllocationPriority = 4 in
 def VRM4NoV0 : VReg<VM4VTs, (sub VRM4, V0M4), 4>;
 
-let AllocationPriority = 8 in
 def VRM8 : VReg<VM8VTs, (add V8M8, V16M8, V24M8, V0M8), 8>;
 
-let AllocationPriority = 8 in
 def VRM8NoV0 : VReg<VM8VTs, (sub VRM8, V0M8), 8>;
 
 def VMV0 : VReg<VMaskVTs, (add V0), 1>;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 954a5889f31ba..61bafd3cfeeef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1311,17 +1311,17 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
 ; CHECK-LABEL: load_factor4_one_active_storeback_full:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v12, 4
-; CHECK-NEXT:    vmv1r.v v17, v12
-; CHECK-NEXT:    vmv1r.v v18, v8
+; CHECK-NEXT:    vslidedown.vi v12, v8, 4
+; CHECK-NEXT:    vmv1r.v v13, v8
+; CHECK-NEXT:    vmv1r.v v14, v12
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v12, 8
-; CHECK-NEXT:    vmv1r.v v19, v8
-; CHECK-NEXT:    vslidedown.vi v20, v12, 12
+; CHECK-NEXT:    vslidedown.vi v16, v8, 8
+; CHECK-NEXT:    vmv1r.v v15, v16
+; CHECK-NEXT:    vslidedown.vi v16, v8, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vsseg4e32.v v17, (a0)
+; CHECK-NEXT:    vsseg4e32.v v13, (a0)
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <16 x i32>, ptr %ptr
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 319dfb037c15a..c3a51986a7f8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -609,21 +609,21 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    vmv1r.v v12, v13
 ; CHECK-NEXT:    vs8r.v v8, (a1)
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg5e8.v v12, (a0)
-; CHECK-NEXT:    vlseg5e8.v v18, (a1)
-; CHECK-NEXT:    vmv2r.v v8, v12
-; CHECK-NEXT:    vmv2r.v v10, v14
-; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vlseg5e8.v v8, (a0)
+; CHECK-NEXT:    vlseg5e8.v v14, (a1)
+; CHECK-NEXT:    vmv2r.v v20, v8
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v21, v14
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmv1r.v v18, v13
-; CHECK-NEXT:    vmsne.vi v8, v18, 0
-; CHECK-NEXT:    vmv1r.v v11, v20
-; CHECK-NEXT:    vmsne.vi v9, v10, 0
-; CHECK-NEXT:    vmv1r.v v20, v15
-; CHECK-NEXT:    vmsne.vi v10, v20, 0
-; CHECK-NEXT:    vmv1r.v v17, v22
-; CHECK-NEXT:    vmsne.vi v11, v16, 0
+; CHECK-NEXT:    vmsne.vi v0, v20, 0
+; CHECK-NEXT:    vmv1r.v v14, v9
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
+; CHECK-NEXT:    vmv1r.v v23, v16
+; CHECK-NEXT:    vmsne.vi v9, v22, 0
+; CHECK-NEXT:    vmv1r.v v16, v11
+; CHECK-NEXT:    vmsne.vi v10, v16, 0
+; CHECK-NEXT:    vmv1r.v v13, v18
+; CHECK-NEXT:    vmsne.vi v11, v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index a41ed52343099..b0eac8bdf48dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -2080,37 +2080,37 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
 ; CHECK-NEXT:    add a3, a4, a2
 ; CHECK-NEXT:    srli a1, a2, 2
 ; CHECK-NEXT:    add a5, a0, a2
-; CHECK-NEXT:    vmv2r.v v2, v18
+; CHECK-NEXT:    vmv4r.v v24, v16
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
 ; CHECK-NEXT:    add a6, a3, a2
-; CHECK-NEXT:    vmv1r.v v3, v16
+; CHECK-NEXT:    vmv1r.v v25, v22
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v4, v20
+; CHECK-NEXT:    vmv1r.v v26, v18
 ; CHECK-NEXT:    vmv1r.v v0, v11
-; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v5, v8
+; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v27, v8
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v6, v22
-; CHECK-NEXT:    vmv1r.v v20, v17
+; CHECK-NEXT:    vmv1r.v v28, v20
+; CHECK-NEXT:    vmv1r.v v18, v23
 ; CHECK-NEXT:    add a7, a6, a2
-; CHECK-NEXT:    vmv1r.v v7, v10
-; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    vmv1r.v v29, v10
+; CHECK-NEXT:    vmv1r.v v20, v9
 ; CHECK-NEXT:    vmv1r.v v0, v13
-; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v24, v11
+; CHECK-NEXT:    vmerge.vim v30, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v22, v11
 ; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vsseg7e8.v v2, (a4)
-; CHECK-NEXT:    vmv1r.v v25, v9
-; CHECK-NEXT:    vsseg7e8.v v19, (a0)
+; CHECK-NEXT:    vsseg7e8.v v24, (a4)
+; CHECK-NEXT:    vmv1r.v v23, v31
+; CHECK-NEXT:    vsseg7e8.v v17, (a0)
 ; CHECK-NEXT:    vl1r.v v8, (a6)
 ; CHECK-NEXT:    add a6, a7, a2
 ; CHECK-NEXT:    vl1r.v v10, (a4)
@@ -2174,37 +2174,37 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    add a0, sp, a0
 ; ZVBB-NEXT:    addi a0, a0, 16
 ; ZVBB-NEXT:    csrr a2, vlenb
-; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v0, v8
 ; ZVBB-NEXT:    vmerge.vim v16, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v8
+; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
 ; ZVBB-NEXT:    add a3, a4, a2
 ; ZVBB-NEXT:    srli a1, a2, 2
 ; ZVBB-NEXT:    add a5, a0, a2
-; ZVBB-NEXT:    vmv2r.v v2, v18
+; ZVBB-NEXT:    vmv4r.v v24, v16
 ; ZVBB-NEXT:    vmv1r.v v0, v9
-; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
+; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
 ; ZVBB-NEXT:    add a6, a3, a2
-; ZVBB-NEXT:    vmv1r.v v3, v16
+; ZVBB-NEXT:    vmv1r.v v25, v22
 ; ZVBB-NEXT:    vmv1r.v v0, v10
 ; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v4, v20
+; ZVBB-NEXT:    vmv1r.v v26, v18
 ; ZVBB-NEXT:    vmv1r.v v0, v11
-; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v5, v8
+; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v27, v8
 ; ZVBB-NEXT:    vmv1r.v v0, v12
 ; ZVBB-NEXT:    vmerge.vim v10, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v6, v22
-; ZVBB-NEXT:    vmv1r.v v20, v17
+; ZVBB-NEXT:    vmv1r.v v28, v20
+; ZVBB-NEXT:    vmv1r.v v18, v23
 ; ZVBB-NEXT:    add a7, a6, a2
-; ZVBB-NEXT:    vmv1r.v v7, v10
-; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    vmv1r.v v29, v10
+; ZVBB-NEXT:    vmv1r.v v20, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v13
-; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v24, v11
+; ZVBB-NEXT:    vmerge.vim v30, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v22, v11
 ; ZVBB-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e8.v v2, (a4)
-; ZVBB-NEXT:    vmv1r.v v25, v9
-; ZVBB-NEXT:    vsseg7e8.v v19, (a0)
+; ZVBB-NEXT:    vsseg7e8.v v24, (a4)
+; ZVBB-NEXT:    vmv1r.v v23, v31
+; ZVBB-NEXT:    vsseg7e8.v v17, (a0)
 ; ZVBB-NEXT:    vl1r.v v8, (a6)
 ; ZVBB-NEXT:    add a6, a7, a2
 ; ZVBB-NEXT:    vl1r.v v10, (a4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 901f3cd63fa9e..7f38ad5211d2d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1162,12 +1162,12 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    li a3, 40
 ; CHECK-NEXT:    mul a2, a2, a3
@@ -1175,7 +1175,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a2, 40
 ; CHECK-NEXT:    mul a0, a0, a2
@@ -1191,16 +1191,16 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 40
 ; CHECK-NEXT:    mul a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll
index 4f7286aeeda1e..66dd191a82ce0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll
@@ -3025,8 +3025,8 @@ define <vscale x 8 x i16> @test_vloxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -3038,8 +3038,8 @@ define <vscale x 8 x i16> @test_vloxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -3112,8 +3112,8 @@ define <vscale x 16 x i16> @test_vloxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i32 %vl, i32 4)
@@ -3125,8 +3125,8 @@ define <vscale x 16 x i16> @test_vloxseg2_mask_nxv16i16_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vloxseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -3460,8 +3460,8 @@ define <vscale x 8 x i16> @test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -3473,8 +3473,8 @@ define <vscale x 8 x i16> @test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -3808,8 +3808,8 @@ define <vscale x 8 x i16> @test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -3821,8 +3821,8 @@ define <vscale x 8 x i16> @test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -5113,8 +5113,8 @@ define <vscale x 4 x i32> @test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -5126,8 +5126,8 @@ define <vscale x 4 x i32> @test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5142,8 +5142,8 @@ define <vscale x 4 x i32> @test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -5155,8 +5155,8 @@ define <vscale x 4 x i32> @test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5200,8 +5200,8 @@ define <vscale x 8 x i32> @test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 5)
@@ -5213,8 +5213,8 @@ define <vscale x 8 x i32> @test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5229,8 +5229,8 @@ define <vscale x 8 x i32> @test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i32 %vl, i32 5)
@@ -5242,8 +5242,8 @@ define <vscale x 8 x i32> @test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5461,8 +5461,8 @@ define <vscale x 4 x i32> @test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -5474,8 +5474,8 @@ define <vscale x 4 x i32> @test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5490,8 +5490,8 @@ define <vscale x 4 x i32> @test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -5503,8 +5503,8 @@ define <vscale x 4 x i32> @test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5722,8 +5722,8 @@ define <vscale x 4 x i32> @test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -5735,8 +5735,8 @@ define <vscale x 4 x i32> @test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5751,8 +5751,8 @@ define <vscale x 4 x i32> @test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -5764,8 +5764,8 @@ define <vscale x 4 x i32> @test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -6592,8 +6592,8 @@ define <vscale x 2 x i64> @test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -6605,8 +6605,8 @@ define <vscale x 2 x i64> @test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6621,8 +6621,8 @@ define <vscale x 2 x i64> @test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -6634,8 +6634,8 @@ define <vscale x 2 x i64> @test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6650,8 +6650,8 @@ define <vscale x 2 x i64> @test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -6663,8 +6663,8 @@ define <vscale x 2 x i64> @test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6679,8 +6679,8 @@ define <vscale x 4 x i64> @test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 6)
@@ -6692,8 +6692,8 @@ define <vscale x 4 x i64> @test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6708,8 +6708,8 @@ define <vscale x 4 x i64> @test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 6)
@@ -6721,8 +6721,8 @@ define <vscale x 4 x i64> @test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6737,8 +6737,8 @@ define <vscale x 4 x i64> @test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i32 %vl, i32 6)
@@ -6750,8 +6750,8 @@ define <vscale x 4 x i64> @test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6853,8 +6853,8 @@ define <vscale x 2 x i64> @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -6866,8 +6866,8 @@ define <vscale x 2 x i64> @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6882,8 +6882,8 @@ define <vscale x 2 x i64> @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -6895,8 +6895,8 @@ define <vscale x 2 x i64> @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6911,8 +6911,8 @@ define <vscale x 2 x i64> @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -6924,8 +6924,8 @@ define <vscale x 2 x i64> @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7027,8 +7027,8 @@ define <vscale x 2 x i64> @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -7040,8 +7040,8 @@ define <vscale x 2 x i64> @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7056,8 +7056,8 @@ define <vscale x 2 x i64> @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -7069,8 +7069,8 @@ define <vscale x 2 x i64> @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7085,8 +7085,8 @@ define <vscale x 2 x i64> @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -7098,8 +7098,8 @@ define <vscale x 2 x i64> @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7703,8 +7703,8 @@ define <vscale x 8 x half> @test_vloxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t
 ; CHECK-LABEL: test_vloxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -7716,8 +7716,8 @@ define <vscale x 8 x half> @test_vloxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -7784,8 +7784,8 @@ define <vscale x 16 x half> @test_vloxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i32 %vl, i32 4)
@@ -7797,8 +7797,8 @@ define <vscale x 16 x half> @test_vloxseg2_mask_nxv16f16_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -8108,8 +8108,8 @@ define <vscale x 8 x half> @test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t
 ; CHECK-LABEL: test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -8121,8 +8121,8 @@ define <vscale x 8 x half> @test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -8432,8 +8432,8 @@ define <vscale x 8 x half> @test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t
 ; CHECK-LABEL: test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -8445,8 +8445,8 @@ define <vscale x 8 x half> @test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -9647,8 +9647,8 @@ define <vscale x 4 x float> @test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -9660,8 +9660,8 @@ define <vscale x 4 x float> @test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9674,8 +9674,8 @@ define <vscale x 4 x float> @test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -9687,8 +9687,8 @@ define <vscale x 4 x float> @test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9728,8 +9728,8 @@ define <vscale x 8 x float> @test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 5)
@@ -9741,8 +9741,8 @@ define <vscale x 8 x float> @test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9755,8 +9755,8 @@ define <vscale x 8 x float> @test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i32 %vl, i32 5)
@@ -9768,8 +9768,8 @@ define <vscale x 8 x float> @test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9971,8 +9971,8 @@ define <vscale x 4 x float> @test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -9984,8 +9984,8 @@ define <vscale x 4 x float> @test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9998,8 +9998,8 @@ define <vscale x 4 x float> @test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -10011,8 +10011,8 @@ define <vscale x 4 x float> @test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -10214,8 +10214,8 @@ define <vscale x 4 x float> @test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -10227,8 +10227,8 @@ define <vscale x 4 x float> @test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -10241,8 +10241,8 @@ define <vscale x 4 x float> @test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -10254,8 +10254,8 @@ define <vscale x 4 x float> @test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -11024,8 +11024,8 @@ define <vscale x 2 x double> @test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -11037,8 +11037,8 @@ define <vscale x 2 x double> @test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11051,8 +11051,8 @@ define <vscale x 2 x double> @test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -11064,8 +11064,8 @@ define <vscale x 2 x double> @test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11078,8 +11078,8 @@ define <vscale x 2 x double> @test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -11091,8 +11091,8 @@ define <vscale x 2 x double> @test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11105,8 +11105,8 @@ define <vscale x 4 x double> @test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 6)
@@ -11118,8 +11118,8 @@ define <vscale x 4 x double> @test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11132,8 +11132,8 @@ define <vscale x 4 x double> @test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 6)
@@ -11145,8 +11145,8 @@ define <vscale x 4 x double> @test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11159,8 +11159,8 @@ define <vscale x 4 x double> @test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i32 %vl, i32 6)
@@ -11172,8 +11172,8 @@ define <vscale x 4 x double> @test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11267,8 +11267,8 @@ define <vscale x 2 x double> @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -11280,8 +11280,8 @@ define <vscale x 2 x double> @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11294,8 +11294,8 @@ define <vscale x 2 x double> @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -11307,8 +11307,8 @@ define <vscale x 2 x double> @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11321,8 +11321,8 @@ define <vscale x 2 x double> @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -11334,8 +11334,8 @@ define <vscale x 2 x double> @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11429,8 +11429,8 @@ define <vscale x 2 x double> @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -11442,8 +11442,8 @@ define <vscale x 2 x double> @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11456,8 +11456,8 @@ define <vscale x 2 x double> @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -11469,8 +11469,8 @@ define <vscale x 2 x double> @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11483,8 +11483,8 @@ define <vscale x 2 x double> @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -11496,8 +11496,8 @@ define <vscale x 2 x double> @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -12077,8 +12077,8 @@ define <vscale x 8 x bfloat> @test_vloxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vloxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -12090,8 +12090,8 @@ define <vscale x 8 x bfloat> @test_vloxseg2_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -12158,8 +12158,8 @@ define <vscale x 16 x bfloat> @test_vloxseg2_nxv16bf16_triscv.vector.tuple_nxv32
 ; CHECK-LABEL: test_vloxseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i32 %vl, i32 4)
@@ -12171,8 +12171,8 @@ define <vscale x 16 x bfloat> @test_vloxseg2_mask_nxv16bf16_triscv.vector.tuple_
 ; CHECK-LABEL: test_vloxseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -12482,8 +12482,8 @@ define <vscale x 8 x bfloat> @test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -12495,8 +12495,8 @@ define <vscale x 8 x bfloat> @test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -12806,8 +12806,8 @@ define <vscale x 8 x bfloat> @test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -12819,8 +12819,8 @@ define <vscale x 8 x bfloat> @test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll
index 7b1d545ff9e9e..5b44c1f7e142b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll
@@ -3924,8 +3924,8 @@ define <vscale x 8 x i16> @test_vloxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -3937,8 +3937,8 @@ define <vscale x 8 x i16> @test_vloxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -4040,8 +4040,8 @@ define <vscale x 16 x i16> @test_vloxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i64 %vl, i64 4)
@@ -4053,8 +4053,8 @@ define <vscale x 16 x i16> @test_vloxseg2_mask_nxv16i16_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vloxseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -4475,8 +4475,8 @@ define <vscale x 8 x i16> @test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -4488,8 +4488,8 @@ define <vscale x 8 x i16> @test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -4939,8 +4939,8 @@ define <vscale x 8 x i16> @test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -4952,8 +4952,8 @@ define <vscale x 8 x i16> @test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -6679,8 +6679,8 @@ define <vscale x 4 x i32> @test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -6692,8 +6692,8 @@ define <vscale x 4 x i32> @test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -6708,8 +6708,8 @@ define <vscale x 4 x i32> @test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -6721,8 +6721,8 @@ define <vscale x 4 x i32> @test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -6795,8 +6795,8 @@ define <vscale x 8 x i32> @test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 5)
@@ -6808,8 +6808,8 @@ define <vscale x 8 x i32> @test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -6824,8 +6824,8 @@ define <vscale x 8 x i32> @test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i64 %vl, i64 5)
@@ -6837,8 +6837,8 @@ define <vscale x 8 x i32> @test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7143,8 +7143,8 @@ define <vscale x 4 x i32> @test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -7156,8 +7156,8 @@ define <vscale x 4 x i32> @test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7172,8 +7172,8 @@ define <vscale x 4 x i32> @test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -7185,8 +7185,8 @@ define <vscale x 4 x i32> @test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7491,8 +7491,8 @@ define <vscale x 4 x i32> @test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -7504,8 +7504,8 @@ define <vscale x 4 x i32> @test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7520,8 +7520,8 @@ define <vscale x 4 x i32> @test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -7533,8 +7533,8 @@ define <vscale x 4 x i32> @test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -8651,8 +8651,8 @@ define <vscale x 2 x i64> @test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -8664,8 +8664,8 @@ define <vscale x 2 x i64> @test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8680,8 +8680,8 @@ define <vscale x 2 x i64> @test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -8693,8 +8693,8 @@ define <vscale x 2 x i64> @test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8709,8 +8709,8 @@ define <vscale x 2 x i64> @test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -8722,8 +8722,8 @@ define <vscale x 2 x i64> @test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8767,8 +8767,8 @@ define <vscale x 4 x i64> @test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 6)
@@ -8780,8 +8780,8 @@ define <vscale x 4 x i64> @test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8796,8 +8796,8 @@ define <vscale x 4 x i64> @test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 6)
@@ -8809,8 +8809,8 @@ define <vscale x 4 x i64> @test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8825,8 +8825,8 @@ define <vscale x 4 x i64> @test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vloxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i64 %vl, i64 6)
@@ -8838,8 +8838,8 @@ define <vscale x 4 x i64> @test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8999,8 +8999,8 @@ define <vscale x 2 x i64> @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -9012,8 +9012,8 @@ define <vscale x 2 x i64> @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9028,8 +9028,8 @@ define <vscale x 2 x i64> @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -9041,8 +9041,8 @@ define <vscale x 2 x i64> @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9057,8 +9057,8 @@ define <vscale x 2 x i64> @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -9070,8 +9070,8 @@ define <vscale x 2 x i64> @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9231,8 +9231,8 @@ define <vscale x 2 x i64> @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -9244,8 +9244,8 @@ define <vscale x 2 x i64> @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9260,8 +9260,8 @@ define <vscale x 2 x i64> @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -9273,8 +9273,8 @@ define <vscale x 2 x i64> @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9289,8 +9289,8 @@ define <vscale x 2 x i64> @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -9302,8 +9302,8 @@ define <vscale x 2 x i64> @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -10133,8 +10133,8 @@ define <vscale x 8 x half> @test_vloxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t
 ; CHECK-LABEL: test_vloxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -10146,8 +10146,8 @@ define <vscale x 8 x half> @test_vloxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -10241,8 +10241,8 @@ define <vscale x 16 x half> @test_vloxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i64 %vl, i64 4)
@@ -10254,8 +10254,8 @@ define <vscale x 16 x half> @test_vloxseg2_mask_nxv16f16_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -10646,8 +10646,8 @@ define <vscale x 8 x half> @test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t
 ; CHECK-LABEL: test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -10659,8 +10659,8 @@ define <vscale x 8 x half> @test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -11078,8 +11078,8 @@ define <vscale x 8 x half> @test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t
 ; CHECK-LABEL: test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -11091,8 +11091,8 @@ define <vscale x 8 x half> @test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -12698,8 +12698,8 @@ define <vscale x 4 x float> @test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -12711,8 +12711,8 @@ define <vscale x 4 x float> @test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -12725,8 +12725,8 @@ define <vscale x 4 x float> @test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -12738,8 +12738,8 @@ define <vscale x 4 x float> @test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -12806,8 +12806,8 @@ define <vscale x 8 x float> @test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 5)
@@ -12819,8 +12819,8 @@ define <vscale x 8 x float> @test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -12833,8 +12833,8 @@ define <vscale x 8 x float> @test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vloxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i64 %vl, i64 5)
@@ -12846,8 +12846,8 @@ define <vscale x 8 x float> @test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13130,8 +13130,8 @@ define <vscale x 4 x float> @test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -13143,8 +13143,8 @@ define <vscale x 4 x float> @test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13157,8 +13157,8 @@ define <vscale x 4 x float> @test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -13170,8 +13170,8 @@ define <vscale x 4 x float> @test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13454,8 +13454,8 @@ define <vscale x 4 x float> @test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -13467,8 +13467,8 @@ define <vscale x 4 x float> @test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13481,8 +13481,8 @@ define <vscale x 4 x float> @test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -13494,8 +13494,8 @@ define <vscale x 4 x float> @test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -14534,8 +14534,8 @@ define <vscale x 2 x double> @test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -14547,8 +14547,8 @@ define <vscale x 2 x double> @test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14561,8 +14561,8 @@ define <vscale x 2 x double> @test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -14574,8 +14574,8 @@ define <vscale x 2 x double> @test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14588,8 +14588,8 @@ define <vscale x 2 x double> @test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -14601,8 +14601,8 @@ define <vscale x 2 x double> @test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14642,8 +14642,8 @@ define <vscale x 4 x double> @test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 6)
@@ -14655,8 +14655,8 @@ define <vscale x 4 x double> @test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14669,8 +14669,8 @@ define <vscale x 4 x double> @test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 6)
@@ -14682,8 +14682,8 @@ define <vscale x 4 x double> @test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14696,8 +14696,8 @@ define <vscale x 4 x double> @test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vloxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i64 %vl, i64 6)
@@ -14709,8 +14709,8 @@ define <vscale x 4 x double> @test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14858,8 +14858,8 @@ define <vscale x 2 x double> @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -14871,8 +14871,8 @@ define <vscale x 2 x double> @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14885,8 +14885,8 @@ define <vscale x 2 x double> @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -14898,8 +14898,8 @@ define <vscale x 2 x double> @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14912,8 +14912,8 @@ define <vscale x 2 x double> @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -14925,8 +14925,8 @@ define <vscale x 2 x double> @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15074,8 +15074,8 @@ define <vscale x 2 x double> @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -15087,8 +15087,8 @@ define <vscale x 2 x double> @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15101,8 +15101,8 @@ define <vscale x 2 x double> @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -15114,8 +15114,8 @@ define <vscale x 2 x double> @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15128,8 +15128,8 @@ define <vscale x 2 x double> @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -15141,8 +15141,8 @@ define <vscale x 2 x double> @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15938,8 +15938,8 @@ define <vscale x 8 x bfloat> @test_vloxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vloxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -15951,8 +15951,8 @@ define <vscale x 8 x bfloat> @test_vloxseg2_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vloxseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vloxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -16046,8 +16046,8 @@ define <vscale x 16 x bfloat> @test_vloxseg2_nxv16bf16_triscv.vector.tuple_nxv32
 ; CHECK-LABEL: test_vloxseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i64 %vl, i64 4)
@@ -16059,8 +16059,8 @@ define <vscale x 16 x bfloat> @test_vloxseg2_mask_nxv16bf16_triscv.vector.tuple_
 ; CHECK-LABEL: test_vloxseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vloxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vloxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -16451,8 +16451,8 @@ define <vscale x 8 x bfloat> @test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -16464,8 +16464,8 @@ define <vscale x 8 x bfloat> @test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vloxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -16883,8 +16883,8 @@ define <vscale x 8 x bfloat> @test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -16896,8 +16896,8 @@ define <vscale x 8 x bfloat> @test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vloxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vloxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll
index d1ca40bcc0db3..e648068fb721b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll
@@ -3025,8 +3025,8 @@ define <vscale x 8 x i16> @test_vluxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -3038,8 +3038,8 @@ define <vscale x 8 x i16> @test_vluxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -3112,8 +3112,8 @@ define <vscale x 16 x i16> @test_vluxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i32 %vl, i32 4)
@@ -3125,8 +3125,8 @@ define <vscale x 16 x i16> @test_vluxseg2_mask_nxv16i16_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vluxseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -3460,8 +3460,8 @@ define <vscale x 8 x i16> @test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -3473,8 +3473,8 @@ define <vscale x 8 x i16> @test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -3808,8 +3808,8 @@ define <vscale x 8 x i16> @test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -3821,8 +3821,8 @@ define <vscale x 8 x i16> @test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -5113,8 +5113,8 @@ define <vscale x 4 x i32> @test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -5126,8 +5126,8 @@ define <vscale x 4 x i32> @test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5142,8 +5142,8 @@ define <vscale x 4 x i32> @test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -5155,8 +5155,8 @@ define <vscale x 4 x i32> @test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5200,8 +5200,8 @@ define <vscale x 8 x i32> @test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 5)
@@ -5213,8 +5213,8 @@ define <vscale x 8 x i32> @test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5229,8 +5229,8 @@ define <vscale x 8 x i32> @test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i32 %vl, i32 5)
@@ -5242,8 +5242,8 @@ define <vscale x 8 x i32> @test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5461,8 +5461,8 @@ define <vscale x 4 x i32> @test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -5474,8 +5474,8 @@ define <vscale x 4 x i32> @test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5490,8 +5490,8 @@ define <vscale x 4 x i32> @test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -5503,8 +5503,8 @@ define <vscale x 4 x i32> @test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5722,8 +5722,8 @@ define <vscale x 4 x i32> @test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -5735,8 +5735,8 @@ define <vscale x 4 x i32> @test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -5751,8 +5751,8 @@ define <vscale x 4 x i32> @test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -5764,8 +5764,8 @@ define <vscale x 4 x i32> @test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -6592,8 +6592,8 @@ define <vscale x 2 x i64> @test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -6605,8 +6605,8 @@ define <vscale x 2 x i64> @test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6621,8 +6621,8 @@ define <vscale x 2 x i64> @test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -6634,8 +6634,8 @@ define <vscale x 2 x i64> @test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6650,8 +6650,8 @@ define <vscale x 2 x i64> @test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -6663,8 +6663,8 @@ define <vscale x 2 x i64> @test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6679,8 +6679,8 @@ define <vscale x 4 x i64> @test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 6)
@@ -6692,8 +6692,8 @@ define <vscale x 4 x i64> @test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6708,8 +6708,8 @@ define <vscale x 4 x i64> @test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 6)
@@ -6721,8 +6721,8 @@ define <vscale x 4 x i64> @test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6737,8 +6737,8 @@ define <vscale x 4 x i64> @test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i32 %vl, i32 6)
@@ -6750,8 +6750,8 @@ define <vscale x 4 x i64> @test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6853,8 +6853,8 @@ define <vscale x 2 x i64> @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -6866,8 +6866,8 @@ define <vscale x 2 x i64> @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6882,8 +6882,8 @@ define <vscale x 2 x i64> @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -6895,8 +6895,8 @@ define <vscale x 2 x i64> @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -6911,8 +6911,8 @@ define <vscale x 2 x i64> @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -6924,8 +6924,8 @@ define <vscale x 2 x i64> @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7027,8 +7027,8 @@ define <vscale x 2 x i64> @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -7040,8 +7040,8 @@ define <vscale x 2 x i64> @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7056,8 +7056,8 @@ define <vscale x 2 x i64> @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -7069,8 +7069,8 @@ define <vscale x 2 x i64> @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7085,8 +7085,8 @@ define <vscale x 2 x i64> @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -7098,8 +7098,8 @@ define <vscale x 2 x i64> @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -7703,8 +7703,8 @@ define <vscale x 8 x half> @test_vluxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t
 ; CHECK-LABEL: test_vluxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -7716,8 +7716,8 @@ define <vscale x 8 x half> @test_vluxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -7784,8 +7784,8 @@ define <vscale x 16 x half> @test_vluxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i32 %vl, i32 4)
@@ -7797,8 +7797,8 @@ define <vscale x 16 x half> @test_vluxseg2_mask_nxv16f16_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -8108,8 +8108,8 @@ define <vscale x 8 x half> @test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t
 ; CHECK-LABEL: test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -8121,8 +8121,8 @@ define <vscale x 8 x half> @test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -8432,8 +8432,8 @@ define <vscale x 8 x half> @test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t
 ; CHECK-LABEL: test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -8445,8 +8445,8 @@ define <vscale x 8 x half> @test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -9647,8 +9647,8 @@ define <vscale x 4 x float> @test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -9660,8 +9660,8 @@ define <vscale x 4 x float> @test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9674,8 +9674,8 @@ define <vscale x 4 x float> @test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -9687,8 +9687,8 @@ define <vscale x 4 x float> @test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9728,8 +9728,8 @@ define <vscale x 8 x float> @test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 5)
@@ -9741,8 +9741,8 @@ define <vscale x 8 x float> @test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9755,8 +9755,8 @@ define <vscale x 8 x float> @test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i32 %vl, i32 5)
@@ -9768,8 +9768,8 @@ define <vscale x 8 x float> @test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9971,8 +9971,8 @@ define <vscale x 4 x float> @test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -9984,8 +9984,8 @@ define <vscale x 4 x float> @test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -9998,8 +9998,8 @@ define <vscale x 4 x float> @test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -10011,8 +10011,8 @@ define <vscale x 4 x float> @test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -10214,8 +10214,8 @@ define <vscale x 4 x float> @test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 5)
@@ -10227,8 +10227,8 @@ define <vscale x 4 x float> @test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -10241,8 +10241,8 @@ define <vscale x 4 x float> @test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 5)
@@ -10254,8 +10254,8 @@ define <vscale x 4 x float> @test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
@@ -11024,8 +11024,8 @@ define <vscale x 2 x double> @test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -11037,8 +11037,8 @@ define <vscale x 2 x double> @test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11051,8 +11051,8 @@ define <vscale x 2 x double> @test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -11064,8 +11064,8 @@ define <vscale x 2 x double> @test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11078,8 +11078,8 @@ define <vscale x 2 x double> @test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -11091,8 +11091,8 @@ define <vscale x 2 x double> @test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11105,8 +11105,8 @@ define <vscale x 4 x double> @test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i32 %vl, i32 6)
@@ -11118,8 +11118,8 @@ define <vscale x 4 x double> @test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11132,8 +11132,8 @@ define <vscale x 4 x double> @test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i32 %vl, i32 6)
@@ -11145,8 +11145,8 @@ define <vscale x 4 x double> @test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11159,8 +11159,8 @@ define <vscale x 4 x double> @test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i32 %vl, i32 6)
@@ -11172,8 +11172,8 @@ define <vscale x 4 x double> @test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11267,8 +11267,8 @@ define <vscale x 2 x double> @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -11280,8 +11280,8 @@ define <vscale x 2 x double> @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11294,8 +11294,8 @@ define <vscale x 2 x double> @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -11307,8 +11307,8 @@ define <vscale x 2 x double> @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11321,8 +11321,8 @@ define <vscale x 2 x double> @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -11334,8 +11334,8 @@ define <vscale x 2 x double> @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11429,8 +11429,8 @@ define <vscale x 2 x double> @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i32 %vl, i32 6)
@@ -11442,8 +11442,8 @@ define <vscale x 2 x double> @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11456,8 +11456,8 @@ define <vscale x 2 x double> @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i32 %vl, i32 6)
@@ -11469,8 +11469,8 @@ define <vscale x 2 x double> @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -11483,8 +11483,8 @@ define <vscale x 2 x double> @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i32 %vl, i32 6)
@@ -11496,8 +11496,8 @@ define <vscale x 2 x double> @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
@@ -12077,8 +12077,8 @@ define <vscale x 8 x bfloat> @test_vluxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vluxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -12090,8 +12090,8 @@ define <vscale x 8 x bfloat> @test_vluxseg2_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -12158,8 +12158,8 @@ define <vscale x 16 x bfloat> @test_vluxseg2_nxv16bf16_triscv.vector.tuple_nxv32
 ; CHECK-LABEL: test_vluxseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i32 %vl, i32 4)
@@ -12171,8 +12171,8 @@ define <vscale x 16 x bfloat> @test_vluxseg2_mask_nxv16bf16_triscv.vector.tuple_
 ; CHECK-LABEL: test_vluxseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -12482,8 +12482,8 @@ define <vscale x 8 x bfloat> @test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -12495,8 +12495,8 @@ define <vscale x 8 x bfloat> @test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
@@ -12806,8 +12806,8 @@ define <vscale x 8 x bfloat> @test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i32 %vl, i32 4)
@@ -12819,8 +12819,8 @@ define <vscale x 8 x bfloat> @test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll
index 0f7348b474ee4..ca07880f80de5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll
@@ -4015,8 +4015,8 @@ define <vscale x 8 x i16> @test_vluxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -4028,8 +4028,8 @@ define <vscale x 8 x i16> @test_vluxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -4131,8 +4131,8 @@ define <vscale x 16 x i16> @test_vluxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i64 %vl, i64 4)
@@ -4144,8 +4144,8 @@ define <vscale x 16 x i16> @test_vluxseg2_mask_nxv16i16_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vluxseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -4566,8 +4566,8 @@ define <vscale x 8 x i16> @test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -4579,8 +4579,8 @@ define <vscale x 8 x i16> @test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -5030,8 +5030,8 @@ define <vscale x 8 x i16> @test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -5043,8 +5043,8 @@ define <vscale x 8 x i16> @test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -6770,8 +6770,8 @@ define <vscale x 4 x i32> @test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -6783,8 +6783,8 @@ define <vscale x 4 x i32> @test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -6799,8 +6799,8 @@ define <vscale x 4 x i32> @test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -6812,8 +6812,8 @@ define <vscale x 4 x i32> @test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -6886,8 +6886,8 @@ define <vscale x 8 x i32> @test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 5)
@@ -6899,8 +6899,8 @@ define <vscale x 8 x i32> @test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -6915,8 +6915,8 @@ define <vscale x 8 x i32> @test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i64 %vl, i64 5)
@@ -6928,8 +6928,8 @@ define <vscale x 8 x i32> @test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7234,8 +7234,8 @@ define <vscale x 4 x i32> @test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -7247,8 +7247,8 @@ define <vscale x 4 x i32> @test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7263,8 +7263,8 @@ define <vscale x 4 x i32> @test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -7276,8 +7276,8 @@ define <vscale x 4 x i32> @test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7582,8 +7582,8 @@ define <vscale x 4 x i32> @test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -7595,8 +7595,8 @@ define <vscale x 4 x i32> @test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -7611,8 +7611,8 @@ define <vscale x 4 x i32> @test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -7624,8 +7624,8 @@ define <vscale x 4 x i32> @test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -8742,8 +8742,8 @@ define <vscale x 2 x i64> @test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -8755,8 +8755,8 @@ define <vscale x 2 x i64> @test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8771,8 +8771,8 @@ define <vscale x 2 x i64> @test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -8784,8 +8784,8 @@ define <vscale x 2 x i64> @test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8800,8 +8800,8 @@ define <vscale x 2 x i64> @test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -8813,8 +8813,8 @@ define <vscale x 2 x i64> @test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8858,8 +8858,8 @@ define <vscale x 4 x i64> @test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 6)
@@ -8871,8 +8871,8 @@ define <vscale x 4 x i64> @test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8887,8 +8887,8 @@ define <vscale x 4 x i64> @test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 6)
@@ -8900,8 +8900,8 @@ define <vscale x 4 x i64> @test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -8916,8 +8916,8 @@ define <vscale x 4 x i64> @test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_
 ; CHECK-LABEL: test_vluxseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i64 %vl, i64 6)
@@ -8929,8 +8929,8 @@ define <vscale x 4 x i64> @test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9090,8 +9090,8 @@ define <vscale x 2 x i64> @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -9103,8 +9103,8 @@ define <vscale x 2 x i64> @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9119,8 +9119,8 @@ define <vscale x 2 x i64> @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -9132,8 +9132,8 @@ define <vscale x 2 x i64> @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9148,8 +9148,8 @@ define <vscale x 2 x i64> @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_
 ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -9161,8 +9161,8 @@ define <vscale x 2 x i64> @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9322,8 +9322,8 @@ define <vscale x 2 x i64> @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -9335,8 +9335,8 @@ define <vscale x 2 x i64> @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9351,8 +9351,8 @@ define <vscale x 2 x i64> @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -9364,8 +9364,8 @@ define <vscale x 2 x i64> @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -9380,8 +9380,8 @@ define <vscale x 2 x i64> @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_
 ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -9393,8 +9393,8 @@ define <vscale x 2 x i64> @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -10224,8 +10224,8 @@ define <vscale x 8 x half> @test_vluxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t
 ; CHECK-LABEL: test_vluxseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -10237,8 +10237,8 @@ define <vscale x 8 x half> @test_vluxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -10332,8 +10332,8 @@ define <vscale x 16 x half> @test_vluxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i64 %vl, i64 4)
@@ -10345,8 +10345,8 @@ define <vscale x 16 x half> @test_vluxseg2_mask_nxv16f16_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -10737,8 +10737,8 @@ define <vscale x 8 x half> @test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t
 ; CHECK-LABEL: test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -10750,8 +10750,8 @@ define <vscale x 8 x half> @test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -11169,8 +11169,8 @@ define <vscale x 8 x half> @test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t
 ; CHECK-LABEL: test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -11182,8 +11182,8 @@ define <vscale x 8 x half> @test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16
 ; CHECK-LABEL: test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -12789,8 +12789,8 @@ define <vscale x 4 x float> @test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -12802,8 +12802,8 @@ define <vscale x 4 x float> @test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -12816,8 +12816,8 @@ define <vscale x 4 x float> @test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -12829,8 +12829,8 @@ define <vscale x 4 x float> @test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -12897,8 +12897,8 @@ define <vscale x 8 x float> @test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 5)
@@ -12910,8 +12910,8 @@ define <vscale x 8 x float> @test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -12924,8 +12924,8 @@ define <vscale x 8 x float> @test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2
 ; CHECK-LABEL: test_vluxseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, i64 %vl, i64 5)
@@ -12937,8 +12937,8 @@ define <vscale x 8 x float> @test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv3
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1.nxv8i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i16> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13221,8 +13221,8 @@ define <vscale x 4 x float> @test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -13234,8 +13234,8 @@ define <vscale x 4 x float> @test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13248,8 +13248,8 @@ define <vscale x 4 x float> @test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3
 ; CHECK-LABEL: test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -13261,8 +13261,8 @@ define <vscale x 4 x float> @test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13545,8 +13545,8 @@ define <vscale x 4 x float> @test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 5)
@@ -13558,8 +13558,8 @@ define <vscale x 4 x float> @test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -13572,8 +13572,8 @@ define <vscale x 4 x float> @test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4
 ; CHECK-LABEL: test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 5)
@@ -13585,8 +13585,8 @@ define <vscale x 4 x float> @test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1
 ; CHECK-LABEL: test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
@@ -14625,8 +14625,8 @@ define <vscale x 2 x double> @test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -14638,8 +14638,8 @@ define <vscale x 2 x double> @test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14652,8 +14652,8 @@ define <vscale x 2 x double> @test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -14665,8 +14665,8 @@ define <vscale x 2 x double> @test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei16.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14679,8 +14679,8 @@ define <vscale x 2 x double> @test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -14692,8 +14692,8 @@ define <vscale x 2 x double> @test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei32.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14733,8 +14733,8 @@ define <vscale x 4 x double> @test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, i64 %vl, i64 6)
@@ -14746,8 +14746,8 @@ define <vscale x 4 x double> @test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i8> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14760,8 +14760,8 @@ define <vscale x 4 x double> @test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, i64 %vl, i64 6)
@@ -14773,8 +14773,8 @@ define <vscale x 4 x double> @test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei16.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei16.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i16> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14787,8 +14787,8 @@ define <vscale x 4 x double> @test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_
 ; CHECK-LABEL: test_vluxseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, i64 %vl, i64 6)
@@ -14800,8 +14800,8 @@ define <vscale x 4 x double> @test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei32.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei32.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1.nxv4i32(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i32> %index, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14949,8 +14949,8 @@ define <vscale x 2 x double> @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -14962,8 +14962,8 @@ define <vscale x 2 x double> @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -14976,8 +14976,8 @@ define <vscale x 2 x double> @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -14989,8 +14989,8 @@ define <vscale x 2 x double> @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei16.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15003,8 +15003,8 @@ define <vscale x 2 x double> @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -15016,8 +15016,8 @@ define <vscale x 2 x double> @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei32.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15165,8 +15165,8 @@ define <vscale x 2 x double> @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, i64 %vl, i64 6)
@@ -15178,8 +15178,8 @@ define <vscale x 2 x double> @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i8> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15192,8 +15192,8 @@ define <vscale x 2 x double> @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, i64 %vl, i64 6)
@@ -15205,8 +15205,8 @@ define <vscale x 2 x double> @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei16.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei16.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i16> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -15219,8 +15219,8 @@ define <vscale x 2 x double> @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_
 ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, i64 %vl, i64 6)
@@ -15232,8 +15232,8 @@ define <vscale x 2 x double> @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv
 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei32.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei32.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i32> %index, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
@@ -16029,8 +16029,8 @@ define <vscale x 8 x bfloat> @test_vluxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vluxseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -16042,8 +16042,8 @@ define <vscale x 8 x bfloat> @test_vluxseg2_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vluxseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vluxseg2ei8.v v6, (a0), v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -16137,8 +16137,8 @@ define <vscale x 16 x bfloat> @test_vluxseg2_nxv16bf16_triscv.vector.tuple_nxv32
 ; CHECK-LABEL: test_vluxseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, i64 %vl, i64 4)
@@ -16150,8 +16150,8 @@ define <vscale x 16 x bfloat> @test_vluxseg2_mask_nxv16bf16_triscv.vector.tuple_
 ; CHECK-LABEL: test_vluxseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vluxseg2ei8.v v12, (a0), v8, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vluxseg2ei8.v v4, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1.nxv16i8(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i8> %index, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -16542,8 +16542,8 @@ define <vscale x 8 x bfloat> @test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -16555,8 +16555,8 @@ define <vscale x 8 x bfloat> @test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg3ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vluxseg3ei8.v v6, (a0), v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
@@ -16974,8 +16974,8 @@ define <vscale x 8 x bfloat> @test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8
 ; CHECK-LABEL: test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, i64 %vl, i64 4)
@@ -16987,8 +16987,8 @@ define <vscale x 8 x bfloat> @test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nx
 ; CHECK-LABEL: test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vluxseg4ei8.v v10, (a0), v8, v0.t
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v14, v8
+; CHECK-NEXT:    vluxseg4ei8.v v6, (a0), v14, v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i8> %index, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)



More information about the llvm-commits mailing list