[llvm-branch-commits] [llvm] [RISCV] Don't use V0 directly in patterns (PR #88496)

Pengcheng Wang via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Apr 17 06:47:51 PDT 2024


https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/88496

From 80a1de2f538938ea505992bae2610754871d4761 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Fri, 12 Apr 2024 18:30:40 +0800
Subject: [PATCH 1/4] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.6-beta.1
---
 llvm/lib/Target/RISCV/RISCVFoldMasks.cpp      |   50 +-
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |   86 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.h        |    1 +
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    |   92 +-
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td |   34 +-
 .../Target/RISCV/RISCVInstrInfoVVLPatterns.td |  430 ++--
 llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td    |   66 +-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    2 +
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td    |    2 +-
 .../instruction-select/rvv/select.mir         |  110 +-
 .../early-clobber-tied-def-subreg-liveness.ll |   14 +-
 .../RISCV/intrinsic-cttz-elts-vscale.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/abd.ll            |   24 +-
 llvm/test/CodeGen/RISCV/rvv/abs-vp.ll         |   36 +-
 llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll  |  234 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll       |  172 +-
 llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll        |  197 +-
 .../RISCV/rvv/constant-folding-crash.ll       |   16 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |   12 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       |  241 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll    |   40 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        |  294 +--
 llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll  |   36 +-
 .../RISCV/rvv/fceil-constrained-sdnode.ll     |   20 +-
 .../RISCV/rvv/ffloor-constrained-sdnode.ll    |   20 +-
 .../RISCV/rvv/fixed-vectors-bitreverse-vp.ll  |  212 +-
 .../RISCV/rvv/fixed-vectors-bswap-vp.ll       |  170 +-
 .../rvv/fixed-vectors-buildvec-of-binop.ll    |    8 +-
 .../RISCV/rvv/fixed-vectors-ceil-vp.ll        |  182 +-
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        |  616 ++----
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       |  140 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        |  596 ++---
 .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll   |   16 +-
 .../RISCV/rvv/fixed-vectors-extract-i1.ll     |   76 +-
 .../fixed-vectors-fceil-constrained-sdnode.ll |   20 +-
 ...fixed-vectors-ffloor-constrained-sdnode.ll |   20 +-
 .../RISCV/rvv/fixed-vectors-floor-vp.ll       |  182 +-
 .../RISCV/rvv/fixed-vectors-fmaximum.ll       |  158 +-
 .../RISCV/rvv/fixed-vectors-fminimum.ll       |  158 +-
 ...d-vectors-fnearbyint-constrained-sdnode.ll |   16 +-
 .../RISCV/rvv/fixed-vectors-fp-buildvec.ll    |    4 +-
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll  |   17 +-
 .../RISCV/rvv/fixed-vectors-fp2i-sat.ll       |    8 +-
 .../fixed-vectors-fround-costrained-sdnode.ll |   20 +-
 ...d-vectors-froundeven-constrained-sdnode.ll |   20 +-
 .../RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll   |   48 +-
 ...fixed-vectors-ftrunc-constrained-sdnode.ll |   20 +-
 .../rvv/fixed-vectors-insert-subvector.ll     |   56 +-
 .../RISCV/rvv/fixed-vectors-int-interleave.ll |   21 +-
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   |   30 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-int.ll    |   38 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  612 +++---
 .../RISCV/rvv/fixed-vectors-mask-buildvec.ll  |   24 +-
 .../RISCV/rvv/fixed-vectors-mask-splat.ll     |    6 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |   30 +-
 .../RISCV/rvv/fixed-vectors-masked-load-fp.ll |   56 +-
 .../rvv/fixed-vectors-masked-load-int.ll      |   54 +-
 .../rvv/fixed-vectors-masked-store-fp.ll      |  160 +-
 .../rvv/fixed-vectors-masked-store-int.ll     |  201 +-
 .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll   |  141 +-
 .../rvv/fixed-vectors-reduction-fp-vp.ll      |   24 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |  276 +--
 .../rvv/fixed-vectors-reduction-int-vp.ll     |   12 +-
 .../RISCV/rvv/fixed-vectors-rint-vp.ll        |  154 +-
 .../RISCV/rvv/fixed-vectors-round-vp.ll       |  182 +-
 .../RISCV/rvv/fixed-vectors-roundeven-vp.ll   |  182 +-
 .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll |  182 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    |   99 +-
 .../RISCV/rvv/fixed-vectors-setcc-int-vp.ll   |   94 +-
 .../rvv/fixed-vectors-shuffle-transpose.ll    |   22 +-
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll |   48 +-
 .../RISCV/rvv/fixed-vectors-trunc-vp.ll       |  277 ---
 .../CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll |   16 +-
 .../RISCV/rvv/fixed-vectors-vadd-vp.ll        |    4 +-
 .../RISCV/rvv/fixed-vectors-vand-vp.ll        |   22 +-
 .../RISCV/rvv/fixed-vectors-vcopysign-vp.ll   |    2 +-
 .../fixed-vectors-vfcmp-constrained-sdnode.ll | 1296 +++++------
 .../RISCV/rvv/fixed-vectors-vfma-vp.ll        |    2 +-
 .../RISCV/rvv/fixed-vectors-vfmax-vp.ll       |    2 +-
 .../RISCV/rvv/fixed-vectors-vfmin-vp.ll       |    2 +-
 .../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll    |    2 +-
 .../RISCV/rvv/fixed-vectors-vmax-vp.ll        |    4 +-
 .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll       |    4 +-
 .../RISCV/rvv/fixed-vectors-vmin-vp.ll        |    4 +-
 .../RISCV/rvv/fixed-vectors-vminu-vp.ll       |    4 +-
 .../RISCV/rvv/fixed-vectors-vpgather.ll       |    4 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll |   38 +-
 .../RISCV/rvv/fixed-vectors-vpscatter.ll      |   64 +-
 .../RISCV/rvv/fixed-vectors-vsadd-vp.ll       |    4 +-
 .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll      |    4 +-
 .../RISCV/rvv/fixed-vectors-vselect-vp.ll     |   63 +-
 .../RISCV/rvv/fixed-vectors-vselect.ll        |   24 +-
 .../RISCV/rvv/fixed-vectors-vssub-vp.ll       |   12 +-
 .../RISCV/rvv/fixed-vectors-vssubu-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/floor-vp.ll       |  197 +-
 .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll |  190 +-
 .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll |  190 +-
 .../rvv/fnearbyint-constrained-sdnode.ll      |   20 +-
 .../RISCV/rvv/fround-costrained-sdnode.ll     |   20 +-
 .../rvv/froundeven-constrained-sdnode.ll      |   20 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |  516 +----
 .../RISCV/rvv/ftrunc-constrained-sdnode.ll    |   20 +-
 .../CodeGen/RISCV/rvv/implicit-def-copy.ll    |    4 +-
 .../CodeGen/RISCV/rvv/insert-subvector.ll     |   20 +-
 llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll |   19 +-
 .../RISCV/rvv/mutate-prior-vsetvli-avl.ll     |   12 +-
 llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll   |  339 ++-
 .../RISCV/rvv/pass-fast-math-flags-sdnode.ll  |    4 +-
 llvm/test/CodeGen/RISCV/rvv/rint-vp.ll        |  331 ++-
 llvm/test/CodeGen/RISCV/rvv/round-vp.ll       |  331 ++-
 llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll   |  331 ++-
 llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll |  331 ++-
 .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir  |    4 +-
 .../RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll |    8 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  227 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |   74 +-
 .../test/CodeGen/RISCV/rvv/shuffle-reverse.ll |   38 +-
 .../CodeGen/RISCV/rvv/sink-splat-operands.ll  |   24 +-
 llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll   |  104 +-
 .../rvv/strided-vpload-vpstore-output.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll |   26 +-
 .../test/CodeGen/RISCV/rvv/strided-vpstore.ll |   25 +-
 llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll   |  176 +-
 llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll        |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll   |    2 +-
 .../RISCV/rvv/vector-deinterleave-load.ll     |    9 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |   37 +-
 .../RISCV/rvv/vector-interleave-store.ll      |   26 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    |   89 +-
 llvm/test/CodeGen/RISCV/rvv/vector-splice.ll  |  160 +-
 llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll       |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll       |   20 +-
 .../RISCV/rvv/vfcmp-constrained-sdnode.ll     | 1944 ++++++++---------
 llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll       |   20 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        |   89 +-
 llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll       |   20 +-
 llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll    |   89 +-
 llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll       |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |   95 +-
 llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll      |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll       |   20 +-
 llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll    |   26 +-
 llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll    |   26 +-
 .../RISCV/rvv/vleff-vlseg2ff-output.ll        |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll        |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll       |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmfeq.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vmfge.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vmfgt.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vmfle.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vmflt.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vmfne.ll          |   72 +-
 llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll        |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll       |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmseq.ll          |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgt.ll          |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll         |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmsle.ll          |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmsleu.ll         |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmslt.ll          |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmsltu.ll         |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vmsne.ll          |  114 +-
 llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll |    8 +-
 .../rvv/vp-reverse-mask-fixed-vectors.ll      |   36 +-
 .../test/CodeGen/RISCV/rvv/vp-reverse-mask.ll |   67 +-
 .../RISCV/rvv/vp-splice-mask-fixed-vectors.ll |  168 +-
 .../RISCV/rvv/vp-splice-mask-vectors.ll       |  300 +--
 .../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll |   47 +-
 llvm/test/CodeGen/RISCV/rvv/vpload.ll         |   38 +-
 llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll |   35 +-
 llvm/test/CodeGen/RISCV/rvv/vpstore.ll        |   25 +-
 .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll    |   24 +-
 .../CodeGen/RISCV/rvv/vreductions-int-vp.ll   |   34 +-
 llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll      |    8 +-
 .../RISCV/rvv/vscale-vw-web-simplification.ll |  200 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll     |   37 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll     |   88 +-
 llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll     |   10 +-
 llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |   95 +-
 llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll     |   10 +-
 llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll       |    4 +-
 192 files changed, 8022 insertions(+), 10527 deletions(-)
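
The shape of the change after instruction selection: masked pseudos used to
read their mask through the physical register $v0 (set up by an extra COPY
and, in SelectionDAG, a glue edge), which is why RISCVFoldMasks tracked $v0
definitions per block. With this patch the mask is passed as an ordinary
virtual-register operand constrained to the VMV0 register class. A rough
MIR-style sketch (the pseudo name, value names and the trailing SEW/policy
immediates are illustrative, not taken from the tests below):

  before (mask threaded through physical $v0, outside SSA):
    $v0 = COPY %mask:vr
    %x:vrnov0 = PseudoVADD_VV_M1_MASK %passthru, %a, %b, $v0, %avl, 5, 1

  after (mask stays a virtual register in the vmv0 class):
    %x:vrnov0 = PseudoVADD_VV_M1_MASK %passthru, %a, %b, %mask:vmv0, %avl, 5, 1

This is what lets isAllOnesMask take a MachineOperand and walk
MRI->getVRegDef() rather than consulting the old V0Defs map.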

diff --git a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
index 2089f5dda6fe52..0425e1411107e0 100644
--- a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
@@ -50,10 +50,7 @@ class RISCVFoldMasks : public MachineFunctionPass {
   bool convertToUnmasked(MachineInstr &MI) const;
   bool convertVMergeToVMv(MachineInstr &MI) const;
 
-  bool isAllOnesMask(const MachineInstr *MaskDef) const;
-
-  /// Maps uses of V0 to the corresponding def of V0.
-  DenseMap<const MachineInstr *, const MachineInstr *> V0Defs;
+  bool isAllOnesMask(const MachineOperand& MaskOp) const;
 };
 
 } // namespace
@@ -62,12 +59,22 @@ char RISCVFoldMasks::ID = 0;
 
 INITIALIZE_PASS(RISCVFoldMasks, DEBUG_TYPE, "RISC-V Fold Masks", false, false)
 
-bool RISCVFoldMasks::isAllOnesMask(const MachineInstr *MaskDef) const {
-  assert(MaskDef && MaskDef->isCopy() &&
-         MaskDef->getOperand(0).getReg() == RISCV::V0);
+bool RISCVFoldMasks::isAllOnesMask(const MachineOperand &MaskOp) const {
+  if (!MaskOp.isReg())
+    return false;
+
+  Register MaskReg = MaskOp.getReg();
+  if (!MaskReg.isVirtual())
+    return false;
+
+  MachineInstr *MaskDef = MRI->getVRegDef(MaskReg);
+  if (!MaskDef || !MaskDef->isCopy())
+    return false;
+
   Register SrcReg = TRI->lookThruCopyLike(MaskDef->getOperand(1).getReg(), MRI);
   if (!SrcReg.isVirtual())
     return false;
+
   MaskDef = MRI->getVRegDef(SrcReg);
   if (!MaskDef)
     return false;
@@ -116,8 +123,8 @@ bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI) const {
                                            TRI->lookThruCopyLike(FalseReg, MRI))
     return false;
 
-  assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0);
-  if (!isAllOnesMask(V0Defs.lookup(&MI)))
+  // Operand 4 is the mask operand; it is now a virtual register rather than $v0.
+  if (!isAllOnesMask(MI.getOperand(4)))
     return false;
 
   MI.setDesc(TII->get(NewOpc));
@@ -140,7 +147,9 @@ bool RISCVFoldMasks::convertToUnmasked(MachineInstr &MI) const {
   if (!I)
     return false;
 
-  if (!isAllOnesMask(V0Defs.lookup(&MI)))
+  // TODO: Increment all MaskOpIdxs in tablegen by num of explicit defs?
+  unsigned MaskOpIdx = I->MaskOpIdx + MI.getNumExplicitDefs();
+  if (!isAllOnesMask(MI.getOperand(MaskOpIdx)))
     return false;
 
   // There are two classes of pseudos in the table - compares and
@@ -160,9 +169,6 @@ bool RISCVFoldMasks::convertToUnmasked(MachineInstr &MI) const {
   (void)HasPolicyOp;
 
   MI.setDesc(MCID);
-
-  // TODO: Increment all MaskOpIdxs in tablegen by num of explicit defs?
-  unsigned MaskOpIdx = I->MaskOpIdx + MI.getNumExplicitDefs();
   MI.removeOperand(MaskOpIdx);
 
   // The unmasked pseudo will no longer be constrained to the vrnov0 reg class,
@@ -193,24 +199,6 @@ bool RISCVFoldMasks::runOnMachineFunction(MachineFunction &MF) {
 
   bool Changed = false;
 
-  // Masked pseudos coming out of isel will have their mask operand in the form:
-  //
-  // $v0:vr = COPY %mask:vr
-  // %x:vr = Pseudo_MASK %a:vr, %b:br, $v0:vr
-  //
-  // Because $v0 isn't in SSA, keep track of its definition at each use so we
-  // can check mask operands.
-  for (const MachineBasicBlock &MBB : MF) {
-    const MachineInstr *CurrentV0Def = nullptr;
-    for (const MachineInstr &MI : MBB) {
-      if (MI.readsRegister(RISCV::V0, TRI))
-        V0Defs[&MI] = CurrentV0Def;
-
-      if (MI.definesRegister(RISCV::V0, TRI))
-        CurrentV0Def = &MI;
-    }
-  }
-
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
       Changed |= convertToUnmasked(MI);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f99dc0b8576368..81543768d6a646 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -296,7 +296,6 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
     bool IsMasked, bool IsStridedOrIndexed, SmallVectorImpl<SDValue> &Operands,
     bool IsLoad, MVT *IndexVT) {
   SDValue Chain = Node->getOperand(0);
-  SDValue Glue;
 
   Operands.push_back(Node->getOperand(CurOp++)); // Base pointer.
 
@@ -307,11 +306,8 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
   }
 
   if (IsMasked) {
-    // Mask needs to be copied to V0.
     SDValue Mask = Node->getOperand(CurOp++);
-    Chain = CurDAG->getCopyToReg(Chain, DL, RISCV::V0, Mask, SDValue());
-    Glue = Chain.getValue(1);
-    Operands.push_back(CurDAG->getRegister(RISCV::V0, Mask.getValueType()));
+    Operands.push_back(Mask);
   }
   SDValue VL;
   selectVLOp(Node->getOperand(CurOp++), VL);
@@ -333,8 +329,6 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
   }
 
   Operands.push_back(Chain); // Chain.
-  if (Glue)
-    Operands.push_back(Glue);
 }
 
 void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, bool IsMasked,
@@ -1670,12 +1664,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
         return;
       }
 
-      // Mask needs to be copied to V0.
-      SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
-                                           RISCV::V0, Mask, SDValue());
-      SDValue Glue = Chain.getValue(1);
-      SDValue V0 = CurDAG->getRegister(RISCV::V0, VT);
-
       // Otherwise use
       // vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0
       // The result is mask undisturbed.
@@ -1683,7 +1671,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       // the agnostic result can be either undisturbed or all 1.
       SDValue Cmp = SDValue(
           CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT,
-                                 {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}),
+                                 {MaskedOff, Src1, Src2, Mask, VL, SEW}),
           0);
       // vmxor.mm vd, vd, v0 is used to update active value.
       ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT,
@@ -3426,32 +3414,7 @@ bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
   return false;
 }
 
-static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) {
-  // Check that we're using V0 as a mask register.
-  if (!isa<RegisterSDNode>(MaskOp) ||
-      cast<RegisterSDNode>(MaskOp)->getReg() != RISCV::V0)
-    return false;
-
-  // The glued user defines V0.
-  const auto *Glued = GlueOp.getNode();
-
-  if (!Glued || Glued->getOpcode() != ISD::CopyToReg)
-    return false;
-
-  // Check that we're defining V0 as a mask register.
-  if (!isa<RegisterSDNode>(Glued->getOperand(1)) ||
-      cast<RegisterSDNode>(Glued->getOperand(1))->getReg() != RISCV::V0)
-    return false;
-
-  // Check the instruction defining V0; it needs to be a VMSET pseudo.
-  SDValue MaskSetter = Glued->getOperand(2);
-
-  // Sometimes the VMSET is wrapped in a COPY_TO_REGCLASS, e.g. if the mask came
-  // from an extract_subvector or insert_subvector.
-  if (MaskSetter->isMachineOpcode() &&
-      MaskSetter->getMachineOpcode() == RISCV::COPY_TO_REGCLASS)
-    MaskSetter = MaskSetter->getOperand(0);
-
+static bool usesAllOnesMask(SDValue MaskOp) {
   const auto IsVMSet = [](unsigned Opc) {
     return Opc == RISCV::PseudoVMSET_M_B1 || Opc == RISCV::PseudoVMSET_M_B16 ||
            Opc == RISCV::PseudoVMSET_M_B2 || Opc == RISCV::PseudoVMSET_M_B32 ||
@@ -3462,14 +3425,13 @@ static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) {
   // TODO: Check that the VMSET is the expected bitwidth? The pseudo has
   // undefined behaviour if it's the wrong bitwidth, so we could choose to
   // assume that it's all-ones? Same applies to its VL.
-  return MaskSetter->isMachineOpcode() &&
-         IsVMSet(MaskSetter.getMachineOpcode());
+  return MaskOp->isMachineOpcode() &&
+         IsVMSet(MaskOp.getMachineOpcode());
 }
 
 // Return true if we can make sure mask of N is all-ones mask.
 static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
-  return usesAllOnesMask(N->getOperand(MaskOpIdx),
-                         N->getOperand(N->getNumOperands() - 1));
+  return usesAllOnesMask(N->getOperand(MaskOpIdx));
 }
 
 static bool isImplicitDef(SDValue V) {
@@ -3515,10 +3477,10 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
     Ops.push_back(Op);
   }
 
-  // Transitively apply any node glued to our new node.
-  const auto *Glued = N->getGluedNode();
-  if (auto *TGlued = Glued->getGluedNode())
-    Ops.push_back(SDValue(TGlued, TGlued->getNumValues() - 1));
+  // The mask is now passed as a plain virtual-register operand instead of
+  // being glued in through a CopyToReg of $v0, so there is no glue node
+  // that needs to be transferred over to the unmasked pseudo when the mask
+  // operand is dropped.
 
   MachineSDNode *Result =
       CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
@@ -3584,7 +3546,7 @@ static unsigned GetVMSetForLMul(RISCVII::VLMUL LMUL) {
 // The resulting policy is the effective policy the vmerge would have had,
 // i.e. whether or not it's merge operand was implicit-def.
 bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
-  SDValue Merge, False, True, VL, Mask, Glue;
+  SDValue Merge, False, True, VL, Mask;
   // A vmv.v.v is equivalent to a vmerge with an all-ones mask.
   if (IsVMv(N)) {
     Merge = N->getOperand(0);
@@ -3600,11 +3562,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
     True = N->getOperand(2);
     Mask = N->getOperand(3);
     VL = N->getOperand(4);
-    // We always have a glue node for the mask at v0.
-    Glue = N->getOperand(N->getNumOperands() - 1);
   }
-  assert(!Mask || cast<RegisterSDNode>(Mask)->getReg() == RISCV::V0);
-  assert(!Glue || Glue.getValueType() == MVT::Glue);
 
   // We require that either merge and false are the same, or that merge
   // is undefined.
@@ -3639,7 +3597,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
 
   // When Mask is not a true mask, this transformation is illegal for some
   // operations whose results are affected by mask, like viota.m.
-  if (Info->MaskAffectsResult && Mask && !usesAllOnesMask(Mask, Glue))
+  if (Info->MaskAffectsResult && Mask && !usesAllOnesMask(Mask))
     return false;
 
   // If True has a merge operand then it needs to be the same as vmerge's False,
@@ -3664,7 +3622,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
       return false;
     // FIXME: Support mask agnostic True instruction which would have an
     // undef merge operand.
-    if (Mask && !usesAllOnesMask(Mask, Glue))
+    if (Mask && !usesAllOnesMask(Mask))
       return false;
   }
 
@@ -3691,8 +3649,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
     if (Mask)
       LoopWorklist.push_back(Mask.getNode());
     LoopWorklist.push_back(VL.getNode());
-    if (Glue)
-      LoopWorklist.push_back(Glue.getNode());
     if (SDNode::hasPredecessorHelper(True.getNode(), Visited, LoopWorklist))
       return false;
   }
@@ -3737,11 +3693,8 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
 
   // From the preconditions we checked above, we know the mask and thus glue
   // for the result node will be taken from True.
-  if (IsMasked) {
+  if (IsMasked)
     Mask = True->getOperand(Info->MaskOpIdx);
-    Glue = True->getOperand(True->getNumOperands() - 1);
-    assert(Glue.getValueType() == MVT::Glue);
-  }
   // If we end up using the vmerge mask the vmerge is actually a vmv.v.v, create
   // an all-ones mask to use.
   else if (IsVMv(N)) {
@@ -3749,13 +3702,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
     unsigned VMSetOpc = GetVMSetForLMul(RISCVII::getLMul(TSFlags));
     ElementCount EC = N->getValueType(0).getVectorElementCount();
     MVT MaskVT = MVT::getVectorVT(MVT::i1, EC);
-
-    SDValue AllOnesMask =
-        SDValue(CurDAG->getMachineNode(VMSetOpc, DL, MaskVT, VL, SEW), 0);
-    SDValue MaskCopy = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
-                                            RISCV::V0, AllOnesMask, SDValue());
-    Mask = CurDAG->getRegister(RISCV::V0, MaskVT);
-    Glue = MaskCopy.getValue(1);
+    Mask = SDValue(CurDAG->getMachineNode(VMSetOpc, DL, MaskVT, VL, SEW), 0);
   }
 
   unsigned MaskedOpc = Info->MaskedPseudo;
@@ -3806,9 +3753,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
   if (HasChainOp)
     Ops.push_back(True.getOperand(TrueChainOpIdx));
 
-  // Add the glue for the CopyToReg of mask->v0.
-  Ops.push_back(Glue);
-
   MachineSDNode *Result =
       CurDAG->getMachineNode(MaskedOpc, DL, True->getVTList(), Ops);
   Result->setFlags(True->getFlags());
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 70fe7da85be0e7..804b9e8ffb2fdd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -279,6 +279,7 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
 
   unsigned getUndefInitOpcode(unsigned RegClassID) const override {
     switch (RegClassID) {
+    case RISCV::VMV0RegClassID:
     case RISCV::VRRegClassID:
       return RISCV::PseudoRVVInitUndefM1;
     case RISCV::VRM2RegClassID:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index cf9a31c23a06e0..8f5467ff8604cc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -3972,12 +3972,12 @@ class VPatUnaryMask<string intrinsic_name,
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
                    (result_type result_reg_class:$merge),
                    (op2_type op2_reg_class:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op2_type op2_reg_class:$rs2),
-                   (mask_type V0), GPR:$vl, log2sew, (XLenVT timm:$policy))>;
+                   (mask_type VMV0:$vm), GPR:$vl, log2sew, (XLenVT timm:$policy))>;
 
 class VPatUnaryMaskRoundingMode<string intrinsic_name,
                                 string inst,
@@ -3993,7 +3993,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
                    (result_type result_reg_class:$merge),
                    (op2_type op2_reg_class:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    (XLenVT timm:$round),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(
@@ -4002,7 +4002,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
                           inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
                    (result_type result_reg_class:$merge),
                    (op2_type op2_reg_class:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    (XLenVT timm:$round),
                    GPR:$vl, log2sew, (XLenVT timm:$policy))>;
 
@@ -4024,12 +4024,12 @@ class VPatMaskUnaryMask<string intrinsic_name,
   Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name#"_mask")
                 (mti.Mask VR:$merge),
                 (mti.Mask VR:$rs2),
-                (mti.Mask V0),
+                (mti.Mask VMV0:$vm),
                 VLOpFrag)),
                 (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
                 (mti.Mask VR:$merge),
                 (mti.Mask VR:$rs2),
-                (mti.Mask V0), GPR:$vl, mti.Log2SEW, TU_MU)>;
+                (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TU_MU)>;
 
 class VPatUnaryAnyMask<string intrinsic,
                        string inst,
@@ -4165,13 +4165,13 @@ class VPatBinaryMask<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0), GPR:$vl, sew)>;
+                   (mask_type VMV0:$vm), GPR:$vl, sew)>;
 
 class VPatBinaryMaskTA<string intrinsic_name,
                        string inst,
@@ -4187,13 +4187,13 @@ class VPatBinaryMaskTA<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
+                   (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>;
 
 class VPatBinaryMaskTARoundingMode<string intrinsic_name,
                                    string inst,
@@ -4209,14 +4209,14 @@ class VPatBinaryMaskTARoundingMode<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    (XLenVT timm:$round),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    (XLenVT timm:$round),
                    GPR:$vl, sew, (XLenVT timm:$policy))>;
 
@@ -4235,13 +4235,13 @@ class VPatBinaryMaskSwapped<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (op2_type op2_kind:$rs2),
                    (op1_type op1_reg_class:$rs1),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag)),
                    (!cast<Instruction>(inst#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0), GPR:$vl, sew)>;
+                   (mask_type VMV0:$vm), GPR:$vl, sew)>;
 
 class VPatTiedBinaryNoMask<string intrinsic_name,
                            string inst,
@@ -4327,12 +4327,12 @@ class VPatTiedBinaryMask<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (result_type result_reg_class:$merge),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK_TIED")
                    (result_type result_reg_class:$merge),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
+                   (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>;
 
 class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
                                      string inst,
@@ -4346,13 +4346,13 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (result_type result_reg_class:$merge),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    (XLenVT timm:$round),
                    VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_MASK_TIED")
                    (result_type result_reg_class:$merge),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    (XLenVT timm:$round),
                    GPR:$vl, sew, (XLenVT timm:$policy))>;
 
@@ -4486,13 +4486,13 @@ class VPatTernaryMask<string intrinsic,
                     (result_type result_reg_class:$rs3),
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     VLOpFrag)),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     GPR:$vl, sew)>;
 
 class VPatTernaryMaskPolicy<string intrinsic,
@@ -4511,13 +4511,13 @@ class VPatTernaryMaskPolicy<string intrinsic,
                     (result_type result_reg_class:$rs3),
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     GPR:$vl, sew, (XLenVT timm:$policy))>;
 
 class VPatTernaryMaskPolicyRoundingMode<string intrinsic,
@@ -4536,14 +4536,14 @@ class VPatTernaryMaskPolicyRoundingMode<string intrinsic,
                     (result_type result_reg_class:$rs3),
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     (XLenVT timm:$round),
                     VLOpFrag, (XLenVT timm:$policy))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     (XLenVT timm:$round),
                     GPR:$vl, sew, (XLenVT timm:$policy))>;
 
@@ -4563,13 +4563,13 @@ class VPatTernaryMaskTA<string intrinsic,
                     (result_type result_reg_class:$rs3),
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     VLOpFrag)),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)# "_MASK")
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
 class VPatTernaryMaskTARoundingMode<string intrinsic,
@@ -4588,14 +4588,14 @@ class VPatTernaryMaskTARoundingMode<string intrinsic,
                     (result_type result_reg_class:$rs3),
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     (XLenVT timm:$round),
                     VLOpFrag)),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)# "_MASK")
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
-                    (mask_type V0),
+                    (mask_type VMV0:$vm),
                     (XLenVT timm:$round),
                     GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
@@ -4607,9 +4607,9 @@ multiclass VPatUnaryS_M<string intrinsic_name,
                       (!cast<Instruction>(inst#"_M_"#mti.BX) $rs1,
                       GPR:$vl, mti.Log2SEW)>;
     def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name # "_mask")
-                      (mti.Mask VR:$rs1), (mti.Mask V0), VLOpFrag)),
+                      (mti.Mask VR:$rs1), (mti.Mask VMV0:$vm), VLOpFrag)),
                       (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK") $rs1,
-                      (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
+                      (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>;
   }
 }
 
@@ -4697,9 +4697,9 @@ multiclass VPatNullaryV<string intrinsic, string instruction> {
                             vti.RegClass:$merge, GPR:$vl, vti.Log2SEW, TU_MU)>;
       def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic # "_mask")
                             (vti.Vector vti.RegClass:$merge),
-                            (vti.Mask V0), VLOpFrag, (XLenVT timm:$policy))),
+                            (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))),
                             (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX # "_MASK")
-                            vti.RegClass:$merge, (vti.Mask V0),
+                            vti.RegClass:$merge, (vti.Mask VMV0:$vm),
                             GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
   }
   }
@@ -4799,13 +4799,13 @@ multiclass VPatBinaryCarryInTAIL<string intrinsic,
                          (result_type result_reg_class:$merge),
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (mask_type V0),
+                         (mask_type VMV0:$vm),
                          VLOpFrag)),
                          (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                          (result_type result_reg_class:$merge),
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (mask_type V0), GPR:$vl, sew)>;
+                         (mask_type VMV0:$vm), GPR:$vl, sew)>;
 }
 
 multiclass VPatBinaryCarryIn<string intrinsic,
@@ -4822,12 +4822,12 @@ multiclass VPatBinaryCarryIn<string intrinsic,
   def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (mask_type V0),
+                         (mask_type VMV0:$vm),
                          VLOpFrag)),
                          (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (mask_type V0), GPR:$vl, sew)>;
+                         (mask_type VMV0:$vm), GPR:$vl, sew)>;
 }
 
 multiclass VPatBinaryMaskOut<string intrinsic,
@@ -6022,10 +6022,10 @@ multiclass VPatCompare_VI<string intrinsic, string inst,
     def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$merge),
                                   (vti.Vector vti.RegClass:$rs1),
                                   (vti.Scalar ImmType:$rs2),
-                                  (vti.Mask V0),
+                                  (vti.Mask VMV0:$vm),
                                   VLOpFrag)),
               (PseudoMask VR:$merge, vti.RegClass:$rs1, (DecImm ImmType:$rs2),
-                          (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                          (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
   }
 }
 
@@ -6182,14 +6182,14 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$merge),
                                                 (vti.Vector vti.RegClass:$rs2),
                                                 (vti.Vector vti.RegClass:$rs1),
-                                                (vti.Mask V0),
+                                                (vti.Mask VMV0:$vm),
                                                 VLOpFrag,
                                                 (XLenVT timm:$policy))),
               (!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX#"_MASK")
                                                         vti.RegClass:$merge,
                                                         vti.RegClass:$rs1,
                                                         vti.RegClass:$rs2,
-                                                        (vti.Mask V0),
+                                                        (vti.Mask VMV0:$vm),
                                                         GPR:$vl,
                                                         vti.Log2SEW,
                                                         (XLenVT timm:$policy))>;
@@ -6207,14 +6207,14 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$merge),
                                                (vti.Vector vti.RegClass:$rs1),
                                                (vti.Scalar simm5_plus1:$rs2),
-                                               (vti.Mask V0),
+                                               (vti.Mask VMV0:$vm),
                                                VLOpFrag,
                                                (XLenVT timm:$policy))),
               (!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX#"_MASK")
                                                         vti.RegClass:$merge,
                                                         vti.RegClass:$rs1,
                                                         (NegImm simm5_plus1:$rs2),
-                                                        (vti.Mask V0),
+                                                        (vti.Mask VMV0:$vm),
                                                         GPR:$vl,
                                                         vti.Log2SEW,
                                                         (XLenVT timm:$policy))>;
@@ -6869,14 +6869,14 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge),
                                                (vti.Vector vti.RegClass:$rs1),
                                                (XLenVT 1),
-                                               (vti.Mask V0),
+                                               (vti.Mask VMV0:$vm),
                                                VLOpFrag,
                                                (XLenVT timm:$policy))),
               (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX#"_MASK")
                                                           vti.RegClass:$merge,
                                                           vti.RegClass:$rs1,
                                                           vti.RegClass:$rs1,
-                                                          (vti.Mask V0),
+                                                          (vti.Mask VMV0:$vm),
                                                           GPR:$vl,
                                                           vti.Log2SEW,
                                                           (XLenVT timm:$policy))>;
@@ -7200,9 +7200,9 @@ foreach fvti = AllFloatVectors in {
   def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$merge),
                                             (fvti.Vector fvti.RegClass:$rs2),
                                             (fvti.Scalar (fpimm0)),
-                                            (fvti.Mask V0), VLOpFrag)),
+                                            (fvti.Mask VMV0:$vm), VLOpFrag)),
             (instr fvti.RegClass:$merge, fvti.RegClass:$rs2, 0,
-                   (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
+                   (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index da761ae856706a..3596ccf27c958b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -976,16 +976,16 @@ foreach vtiToWti = AllWidenableIntVectors in {
               (!cast<Instruction>("PseudoVWADDU_VV_"#vti.LMul.MX)
                   (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs1,
                   vti.AVL, vti.Log2SEW, TA_MA)>;
-    def : Pat<(shl (wti.Vector (riscv_sext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask V0), VLOpFrag)),
+    def : Pat<(shl (wti.Vector (riscv_sext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm), VLOpFrag)),
                    (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), 1, (XLenVT srcvalue)))),
               (!cast<Instruction>("PseudoVWADD_VV_"#vti.LMul.MX#"_MASK")
                   (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs1,
-                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    def : Pat<(shl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask V0), VLOpFrag)),
+                  (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+    def : Pat<(shl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm), VLOpFrag)),
                    (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), 1, (XLenVT srcvalue)))),
               (!cast<Instruction>("PseudoVWADDU_VV_"#vti.LMul.MX#"_MASK")
                   (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs1,
-                  (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                  (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -1129,24 +1129,24 @@ defm : VPatWidenMulAddSDNode_VX<zext_oneuse, sext_oneuse, "PseudoVWMACCUS">;
 // 11.15. Vector Integer Merge Instructions
 foreach vti = AllIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (vselect (vti.Mask V0), vti.RegClass:$rs1,
+    def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), vti.RegClass:$rs1,
                                                         vti.RegClass:$rs2)),
               (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
                    (vti.Vector (IMPLICIT_DEF)),
-                   vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0),
+                   vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask VMV0:$vm),
                    vti.AVL, vti.Log2SEW)>;
 
-    def : Pat<(vti.Vector (vselect (vti.Mask V0), (SplatPat XLenVT:$rs1),
-                                                        vti.RegClass:$rs2)),
+    def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat XLenVT:$rs1),
+                                   vti.RegClass:$rs2)),
               (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
                    (vti.Vector (IMPLICIT_DEF)),
-                   vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), vti.AVL, vti.Log2SEW)>;
+                   vti.RegClass:$rs2, GPR:$rs1, (vti.Mask VMV0:$vm), vti.AVL, vti.Log2SEW)>;
 
-    def : Pat<(vti.Vector (vselect (vti.Mask V0), (SplatPat_simm5 simm5:$rs1),
+    def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat_simm5 simm5:$rs1),
                                                         vti.RegClass:$rs2)),
               (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
                    (vti.Vector (IMPLICIT_DEF)),
-                   vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), vti.AVL, vti.Log2SEW)>;
+                   vti.RegClass:$rs2, simm5:$rs1, (vti.Mask VMV0:$vm), vti.AVL, vti.Log2SEW)>;
   }
 }
 
@@ -1391,30 +1391,30 @@ defm : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">;
 foreach fvti = AllFloatVectors in {
   defvar ivti = GetIntVTypeInfo<fvti>.Vti;
   let Predicates = GetVTypePredicates<ivti>.Predicates in {
-    def : Pat<(fvti.Vector (vselect (fvti.Mask V0), fvti.RegClass:$rs1,
+    def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1,
                                                           fvti.RegClass:$rs2)),
               (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
                    (fvti.Vector (IMPLICIT_DEF)),
-                   fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
+                   fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm),
                    fvti.AVL, fvti.Log2SEW)>;
 
-    def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
+    def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
                                     (SplatFPOp (fvti.Scalar fpimm0)),
                                     fvti.RegClass:$rs2)),
               (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
                    (fvti.Vector (IMPLICIT_DEF)),
-                   fvti.RegClass:$rs2, 0, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>;
+                   fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>;
 
   }
   let Predicates = GetVTypePredicates<fvti>.Predicates in 
-    def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
+    def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
                                     (SplatFPOp fvti.ScalarRegClass:$rs1),
                                     fvti.RegClass:$rs2)),
               (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
                    (fvti.Vector (IMPLICIT_DEF)),
                    fvti.RegClass:$rs2,
                    (fvti.Scalar fvti.ScalarRegClass:$rs1),
-                   (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>;
+                   (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>;
 }
 
 // 13.17. Vector Single-Width Floating-Point/Integer Type-Convert Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index b721dcd9898854..2a1f4a644aa9bf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -622,7 +622,7 @@ class VPatBinaryVL_V<SDPatternOperator vop,
                        (op1_type op1_reg_class:$rs1),
                        (op2_type op2_reg_class:$rs2),
                        (result_type result_reg_class:$merge),
-                       (mask_type V0),
+                       (mask_type VMV0:$vm),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
@@ -631,7 +631,7 @@ class VPatBinaryVL_V<SDPatternOperator vop,
                    result_reg_class:$merge,
                    op1_reg_class:$rs1,
                    op2_reg_class:$rs2,
-                   (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
+                   (mask_type VMV0:$vm), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
 class VPatBinaryVL_V_RM<SDPatternOperator vop,
                      string instruction_name,
@@ -650,7 +650,7 @@ class VPatBinaryVL_V_RM<SDPatternOperator vop,
                        (op1_type op1_reg_class:$rs1),
                        (op2_type op2_reg_class:$rs2),
                        (result_type result_reg_class:$merge),
-                       (mask_type V0),
+                       (mask_type VMV0:$vm),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
@@ -659,7 +659,7 @@ class VPatBinaryVL_V_RM<SDPatternOperator vop,
                    result_reg_class:$merge,
                    op1_reg_class:$rs1,
                    op2_reg_class:$rs2,
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
@@ -713,12 +713,12 @@ class VPatTiedBinaryMaskVL_V<SDNode vop,
                    (result_type result_reg_class:$rs1),
                    (op2_type op2_reg_class:$rs2),
                    (result_type result_reg_class:$rs1),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag)),
       (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_MASK_TIED")
                    result_reg_class:$rs1,
                    op2_reg_class:$rs2,
-                   (mask_type V0), GPR:$vl, sew, TU_MU)>;
+                   (mask_type VMV0:$vm), GPR:$vl, sew, TU_MU)>;
 
 multiclass VPatTiedBinaryNoMaskVL_V_RM<SDNode vop,
                                        string instruction_name,
@@ -778,7 +778,7 @@ class VPatBinaryVL_XI<SDPatternOperator vop,
                    (vop1_type vop_reg_class:$rs1),
                    (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
                    (result_type result_reg_class:$merge),
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
@@ -787,7 +787,7 @@ class VPatBinaryVL_XI<SDPatternOperator vop,
                    result_reg_class:$merge,
                    vop_reg_class:$rs1,
                    xop_kind:$rs2,
-                   (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
+                   (mask_type VMV0:$vm), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
 multiclass VPatBinaryVL_VV_VX<SDPatternOperator vop, string instruction_name,
                               list<VTypeInfo> vtilist = AllIntegerVectors,
@@ -902,7 +902,7 @@ class VPatBinaryVL_VF<SDPatternOperator vop,
     : Pat<(result_type (vop (vop1_type vop_reg_class:$rs1),
                        (vop2_type (SplatFPOp scalar_reg_class:$rs2)),
                        (result_type result_reg_class:$merge),
-                       (mask_type V0),
+                       (mask_type VMV0:$vm),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
@@ -911,7 +911,7 @@ class VPatBinaryVL_VF<SDPatternOperator vop,
                    result_reg_class:$merge,
                    vop_reg_class:$rs1,
                    scalar_reg_class:$rs2,
-                   (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
+                   (mask_type VMV0:$vm), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
 class VPatBinaryVL_VF_RM<SDPatternOperator vop,
                       string instruction_name,
@@ -928,7 +928,7 @@ class VPatBinaryVL_VF_RM<SDPatternOperator vop,
     : Pat<(result_type (vop (vop1_type vop_reg_class:$rs1),
                        (vop2_type (SplatFPOp scalar_reg_class:$rs2)),
                        (result_type result_reg_class:$merge),
-                       (mask_type V0),
+                       (mask_type VMV0:$vm),
                        VLOpFrag)),
       (!cast<Instruction>(
                    !if(isSEWAware,
@@ -937,7 +937,7 @@ class VPatBinaryVL_VF_RM<SDPatternOperator vop,
                    result_reg_class:$merge,
                    vop_reg_class:$rs1,
                    scalar_reg_class:$rs2,
-                   (mask_type V0),
+                   (mask_type VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
@@ -982,7 +982,7 @@ multiclass VPatBinaryFPVL_R_VF<SDPatternOperator vop, string instruction_name,
     def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 fvti.RegClass:$rs1,
                                 (fvti.Vector fvti.RegClass:$merge),
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(
                            !if(isSEWAware,
@@ -990,7 +990,7 @@ multiclass VPatBinaryFPVL_R_VF<SDPatternOperator vop, string instruction_name,
                                instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK"))
                            fvti.RegClass:$merge,
                            fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
-                           (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
+                           (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -1001,7 +1001,7 @@ multiclass VPatBinaryFPVL_R_VF_RM<SDPatternOperator vop, string instruction_name
     def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 fvti.RegClass:$rs1,
                                 (fvti.Vector fvti.RegClass:$merge),
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(
                            !if(isSEWAware,
@@ -1009,7 +1009,7 @@ multiclass VPatBinaryFPVL_R_VF_RM<SDPatternOperator vop, string instruction_name
                                instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK"))
                            fvti.RegClass:$merge,
                            fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
-                           (fvti.Mask V0),
+                           (fvti.Mask VMV0:$vm),
                            // Value to indicate no rounding mode change in
                            // RISCVInsertReadWriteCSR
                            FRM_DYN,
@@ -1022,13 +1022,13 @@ multiclass VPatIntegerSetCCVL_VV<VTypeInfo vti, string instruction_name,
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       vti.RegClass:$rs2, cc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX#"_MASK")
                          VR:$merge,
                          vti.RegClass:$rs1,
                          vti.RegClass:$rs2,
-                         (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                         (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
 }
 
 // Inherits from VPatIntegerSetCCVL_VV and adds a pattern with operands swapped.
@@ -1038,11 +1038,11 @@ multiclass VPatIntegerSetCCVL_VV_Swappable<VTypeInfo vti, string instruction_nam
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs2),
                                       vti.RegClass:$rs1, invcc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX#"_MASK")
                          VR:$merge, vti.RegClass:$rs1,
-                         vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                         vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
 }
 
 multiclass VPatIntegerSetCCVL_VX_Swappable<VTypeInfo vti, string instruction_name,
@@ -1051,17 +1051,17 @@ multiclass VPatIntegerSetCCVL_VX_Swappable<VTypeInfo vti, string instruction_nam
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       (SplatPat (XLenVT GPR:$rs2)), cc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (instruction_masked VR:$merge, vti.RegClass:$rs1,
-                                GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                                GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
   def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)),
                                       (vti.Vector vti.RegClass:$rs1), invcc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (instruction_masked VR:$merge, vti.RegClass:$rs1,
-                                GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                                GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
 }
 
 multiclass VPatIntegerSetCCVL_VI_Swappable<VTypeInfo vti, string instruction_name,
@@ -1070,20 +1070,20 @@ multiclass VPatIntegerSetCCVL_VI_Swappable<VTypeInfo vti, string instruction_nam
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       (SplatPat_simm5 simm5:$rs2), cc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (instruction_masked VR:$merge, vti.RegClass:$rs1,
-                                XLenVT:$rs2, (vti.Mask V0), GPR:$vl,
+                                XLenVT:$rs2, (vti.Mask VMV0:$vm), GPR:$vl,
                                 vti.Log2SEW)>;
 
   // FIXME: Can do some canonicalization to remove these patterns.
   def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat_simm5 simm5:$rs2),
                                       (vti.Vector vti.RegClass:$rs1), invcc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (instruction_masked VR:$merge, vti.RegClass:$rs1,
-                                simm5:$rs2, (vti.Mask V0), GPR:$vl,
+                                simm5:$rs2, (vti.Mask VMV0:$vm), GPR:$vl,
                                 vti.Log2SEW)>;
 }
 
@@ -1095,20 +1095,20 @@ multiclass VPatIntegerSetCCVL_VIPlus1_Swappable<VTypeInfo vti,
   def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
                                       (splatpat_kind simm5:$rs2), cc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (instruction_masked VR:$merge, vti.RegClass:$rs1,
-                                (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl,
+                                (DecImm simm5:$rs2), (vti.Mask VMV0:$vm), GPR:$vl,
                                 vti.Log2SEW)>;
 
   // FIXME: Can do some canonicalization to remove these patterns.
   def : Pat<(vti.Mask (riscv_setcc_vl (splatpat_kind simm5:$rs2),
                                       (vti.Vector vti.RegClass:$rs1), invcc,
                                       VR:$merge,
-                                      (vti.Mask V0),
+                                      (vti.Mask VMV0:$vm),
                                       VLOpFrag)),
             (instruction_masked VR:$merge, vti.RegClass:$rs1,
-                                (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl,
+                                (DecImm simm5:$rs2), (vti.Mask VMV0:$vm), GPR:$vl,
                                 vti.Log2SEW)>;
 }
 
@@ -1121,31 +1121,31 @@ multiclass VPatFPSetCCVL_VV_VF_FV<SDPatternOperator vop, CondCode cc,
                                  fvti.RegClass:$rs2,
                                  cc,
                                  VR:$merge,
-                                 (fvti.Mask V0),
+                                 (fvti.Mask VMV0:$vm),
                                  VLOpFrag)),
                 (!cast<Instruction>(inst_name#"_VV_"#fvti.LMul.MX#"_MASK")
                     VR:$merge, fvti.RegClass:$rs1,
-                    fvti.RegClass:$rs2, (fvti.Mask V0),
+                    fvti.RegClass:$rs2, (fvti.Mask VMV0:$vm),
                     GPR:$vl, fvti.Log2SEW)>;
       def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1),
                                 (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 cc,
                                 VR:$merge,
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
                 (!cast<Instruction>(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
                     VR:$merge, fvti.RegClass:$rs1,
-                    fvti.ScalarRegClass:$rs2, (fvti.Mask V0),
+                    fvti.ScalarRegClass:$rs2, (fvti.Mask VMV0:$vm),
                     GPR:$vl, fvti.Log2SEW)>;
       def : Pat<(fvti.Mask (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
                                 (fvti.Vector fvti.RegClass:$rs1),
                                 cc,
                                 VR:$merge,
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
                 (!cast<Instruction>(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
                     VR:$merge, fvti.RegClass:$rs1,
-                    fvti.ScalarRegClass:$rs2, (fvti.Mask V0),
+                    fvti.ScalarRegClass:$rs2, (fvti.Mask VMV0:$vm),
                     GPR:$vl, fvti.Log2SEW)>;
     }
   }
@@ -1159,11 +1159,11 @@ multiclass VPatExtendVL_V<SDNode vop, string inst_name, string suffix,
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<fti>.Predicates) in
     def : Pat<(vti.Vector (vop (fti.Vector fti.RegClass:$rs2),
-                               (fti.Mask V0), VLOpFrag)),
+                               (fti.Mask VMV0:$vm), VLOpFrag)),
               (!cast<Instruction>(inst_name#"_"#suffix#"_"#vti.LMul.MX#"_MASK")
                   (vti.Vector (IMPLICIT_DEF)),
                   fti.RegClass:$rs2,
-                  (fti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                  (fti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1175,11 +1175,11 @@ multiclass VPatConvertFP2IVL_V<SDPatternOperator vop, string instruction_name> {
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<ivti>.Predicates) in
     def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX#"_MASK")
                   (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                  (fvti.Mask V0), GPR:$vl, ivti.Log2SEW, TA_MA)>;
+                  (fvti.Mask VMV0:$vm), GPR:$vl, ivti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1189,11 +1189,11 @@ multiclass VPatConvertFP2IVL_V_RM<SDPatternOperator vop, string instruction_name
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<ivti>.Predicates) in
     def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX#"_MASK")
                   (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                  (fvti.Mask V0),
+                  (fvti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1208,11 +1208,11 @@ multiclass VPatConvertFP2I_RM_VL_V<SDPatternOperator vop, string instruction_nam
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<ivti>.Predicates) in
     def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
-                                (fvti.Mask V0), (XLenVT timm:$frm),
+                                (fvti.Mask VMV0:$vm), (XLenVT timm:$frm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX#"_MASK")
                   (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                  (fvti.Mask V0), timm:$frm, GPR:$vl, ivti.Log2SEW,
+                  (fvti.Mask VMV0:$vm), timm:$frm, GPR:$vl, ivti.Log2SEW,
                   TA_MA)>;
   }
 }
@@ -1223,11 +1223,11 @@ multiclass VPatConvertI2FPVL_V_RM<SDPatternOperator vop, string instruction_name
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<ivti>.Predicates) in
     def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
-                                (ivti.Mask V0),
+                                (ivti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1,
-                  (ivti.Mask V0),
+                  (ivti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1241,11 +1241,11 @@ multiclass VPatConvertI2FP_RM_VL_V<SDNode vop, string instruction_name> {
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<ivti>.Predicates) in
     def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
-                                (ivti.Mask V0), (XLenVT timm:$frm),
+                                (ivti.Mask VMV0:$vm), (XLenVT timm:$frm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1,
-                  (ivti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+                  (ivti.Mask VMV0:$vm), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1258,11 +1258,11 @@ multiclass VPatWConvertFP2IVL_V<SDPatternOperator vop, string instruction_name>
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<iwti>.Predicates) in
     def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                  (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TA_MA)>;
+                  (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1273,11 +1273,11 @@ multiclass VPatWConvertFP2IVL_V_RM<SDPatternOperator vop, string instruction_nam
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<iwti>.Predicates) in
     def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
-                                (fvti.Mask V0),
+                                (fvti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                  (fvti.Mask V0),
+                  (fvti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1293,11 +1293,11 @@ multiclass VPatWConvertFP2I_RM_VL_V<SDNode vop, string instruction_name> {
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<iwti>.Predicates) in
     def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1),
-                                (fvti.Mask V0), (XLenVT timm:$frm),
+                                (fvti.Mask VMV0:$vm), (XLenVT timm:$frm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                  (fvti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+                  (fvti.Mask VMV0:$vm), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1309,11 +1309,11 @@ multiclass VPatWConvertI2FPVL_V<SDPatternOperator vop,
     let Predicates = !listconcat(GetVTypePredicates<ivti>.Predicates,
                                  GetVTypePredicates<fwti>.Predicates) in
     def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
-                                (ivti.Mask V0),
+                                (ivti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX#"_MASK")
                   (fwti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1,
-                  (ivti.Mask V0),
+                  (ivti.Mask VMV0:$vm),
                   GPR:$vl, ivti.Log2SEW, TA_MA)>;
   }
 }
@@ -1330,11 +1330,11 @@ multiclass VPatNConvertFP2IVL_W<SDPatternOperator vop,
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<fwti>.Predicates) in
     def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1),
-                               (fwti.Mask V0),
+                               (fwti.Mask VMV0:$vm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#vti.LMul.MX#"_MASK")
                   (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
-                  (fwti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                  (fwti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1348,11 +1348,11 @@ multiclass VPatNConvertFP2IVL_W_RM<SDPatternOperator vop,
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<fwti>.Predicates) in
     def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1),
-                               (fwti.Mask V0),
+                               (fwti.Mask VMV0:$vm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#vti.LMul.MX#"_MASK")
                   (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
-                  (fwti.Mask V0),
+                  (fwti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1367,11 +1367,11 @@ multiclass VPatNConvertFP2I_RM_VL_W<SDNode vop, string instruction_name> {
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                  GetVTypePredicates<fwti>.Predicates) in
     def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1),
-                               (fwti.Mask V0), (XLenVT timm:$frm),
+                               (fwti.Mask VMV0:$vm), (XLenVT timm:$frm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#vti.LMul.MX#"_MASK")
                   (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
-                  (fwti.Mask V0), timm:$frm, GPR:$vl, vti.Log2SEW, TA_MA)>;
+                  (fwti.Mask VMV0:$vm), timm:$frm, GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1383,11 +1383,11 @@ multiclass VPatNConvertI2FPVL_W_RM<SDPatternOperator vop,
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<iwti>.Predicates) in
     def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1),
-                                (iwti.Mask V0),
+                                (iwti.Mask VMV0:$vm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1,
-                  (iwti.Mask V0),
+                  (iwti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1402,11 +1402,11 @@ multiclass VPatNConvertI2FP_RM_VL_W<SDNode vop, string instruction_name> {
     let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
                                  GetVTypePredicates<iwti>.Predicates) in
     def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1),
-                                (iwti.Mask V0),  (XLenVT timm:$frm),
+                                (iwti.Mask VMV0:$vm),  (XLenVT timm:$frm),
                                 VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
                   (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1,
-                  (iwti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+                  (iwti.Mask VMV0:$vm), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1416,13 +1416,13 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge),
                                    (vti.Vector vti.RegClass:$rs1), VR:$rs2,
-                                   (vti.Mask V0), VLOpFrag,
+                                   (vti.Mask VMV0:$vm), VLOpFrag,
                                    (XLenVT timm:$policy))),
           (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
               (vti_m1.Vector VR:$merge),
               (vti.Vector vti.RegClass:$rs1),
               (vti_m1.Vector VR:$rs2),
-              (vti.Mask V0), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
+              (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
     }
   }
 }
@@ -1433,13 +1433,13 @@ multiclass VPatReductionVL_RM<SDNode vop, string instruction_name, bit is_float>
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge),
                                    (vti.Vector vti.RegClass:$rs1), VR:$rs2,
-                                   (vti.Mask V0), VLOpFrag,
+                                   (vti.Mask VMV0:$vm), VLOpFrag,
                                    (XLenVT timm:$policy))),
           (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
               (vti_m1.Vector VR:$merge),
               (vti.Vector vti.RegClass:$rs1),
               (vti_m1.Vector VR:$rs2),
-              (vti.Mask V0),
+              (vti.Mask VMV0:$vm),
               // Value to indicate no rounding mode change in
               // RISCVInsertReadWriteCSR
               FRM_DYN,
@@ -1498,11 +1498,11 @@ multiclass VPatWidenReductionVL<SDNode vop, PatFrags extop, string instruction_n
                                  GetVTypePredicates<wti>.Predicates) in {
       def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
-                                   VR:$rs2, (vti.Mask V0), VLOpFrag,
+                                   VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
                   (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
-                  (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                  (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                   (XLenVT timm:$policy))>;
     }
   }
@@ -1517,11 +1517,11 @@ multiclass VPatWidenReductionVL_RM<SDNode vop, PatFrags extop, string instructio
                                  GetVTypePredicates<wti>.Predicates) in {
       def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
-                                   VR:$rs2, (vti.Mask V0), VLOpFrag,
+                                   VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
                   (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
-                  (wti_m1.Vector VR:$rs2), (vti.Mask V0),
+                  (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1540,11 +1540,11 @@ multiclass VPatWidenReductionVL_Ext_VL<SDNode vop, PatFrags extop, string instru
                                  GetVTypePredicates<wti>.Predicates) in {
       def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
-                                   VR:$rs2, (vti.Mask V0), VLOpFrag,
+                                   VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
                   (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
-                  (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                  (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                   (XLenVT timm:$policy))>;
     }
   }
@@ -1559,11 +1559,11 @@ multiclass VPatWidenReductionVL_Ext_VL_RM<SDNode vop, PatFrags extop, string ins
                                  GetVTypePredicates<wti>.Predicates) in {
       def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
                                    (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
-                                   VR:$rs2, (vti.Mask V0), VLOpFrag,
+                                   VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag,
                                    (XLenVT timm:$policy))),
                (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
                   (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
-                  (wti_m1.Vector VR:$rs2), (vti.Mask V0),
+                  (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -1686,10 +1686,10 @@ multiclass VPatNarrowShiftExtVL_WV<SDNode op, PatFrags extop, string instruction
               (wti.Vector (extop (vti.Vector vti.RegClass:$rs1),
                                  (vti.Mask true_mask), VLOpFrag)),
           srcvalue, (vti.Mask true_mask), VLOpFrag),
-        (vti.Mask V0), VLOpFrag)),
+        (vti.Mask VMV0:$vm), VLOpFrag)),
       (!cast<Instruction>(instruction_name#"_WV_"#vti.LMul.MX#"_MASK")
         (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs2, vti.RegClass:$rs1,
-        (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+        (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -1732,7 +1732,7 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
   foreach vti = AllIntegerVectors in {
   defvar suffix = vti.LMul.MX;
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                 (vti.Vector (op vti.RegClass:$rd,
                                 (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
                                     srcvalue, (vti.Mask true_mask), VLOpFrag),
@@ -1740,8 +1740,8 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
                             vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                 (vti.Vector (op vti.RegClass:$rd,
                                 (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
                                     srcvalue, (vti.Mask true_mask), VLOpFrag),
@@ -1749,8 +1749,8 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
                             vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                 (vti.Vector (op vti.RegClass:$rd,
                                 (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
                                     srcvalue, (vti.Mask true_mask), VLOpFrag),
@@ -1758,8 +1758,8 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
                             vti.RegClass:$rd, undef, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                 (vti.Vector (op vti.RegClass:$rd,
                                 (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
                                     srcvalue, (vti.Mask true_mask), VLOpFrag),
@@ -1767,7 +1767,7 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
                             vti.RegClass:$rd, undef, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     }
   }
 }
@@ -1781,17 +1781,17 @@ multiclass VPatWidenMultiplyAddVL_VV_VX<SDNode vwmacc_op, string instr_name> {
       def : Pat<(vwmacc_op (vti.Vector vti.RegClass:$rs1),
                            (vti.Vector vti.RegClass:$rs2),
                            (wti.Vector wti.RegClass:$rd),
-                           (vti.Mask V0), VLOpFrag),
+                           (vti.Mask VMV0:$vm), VLOpFrag),
                 (!cast<Instruction>(instr_name#"_VV_"#vti.LMul.MX#"_MASK")
                     wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                    (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
       def : Pat<(vwmacc_op (SplatPat XLenVT:$rs1),
                            (vti.Vector vti.RegClass:$rs2),
                            (wti.Vector wti.RegClass:$rd),
-                           (vti.Mask V0), VLOpFrag),
+                           (vti.Mask VMV0:$vm), VLOpFrag),
                 (!cast<Instruction>(instr_name#"_VX_"#vti.LMul.MX#"_MASK")
                     wti.RegClass:$rd, vti.ScalarRegClass:$rs1,
-                    vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                    vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                     TAIL_AGNOSTIC)>;
     }
   }
@@ -1824,19 +1824,19 @@ multiclass VPatFPMulAddVL_VV_VF<SDPatternOperator vop, string instruction_name>
   defvar suffix = vti.LMul.MX;
   let Predicates = GetVTypePredicates<vti>.Predicates in {
     def : Pat<(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rd,
-                               vti.RegClass:$rs2, (vti.Mask V0),
+                               vti.RegClass:$rs2, (vti.Mask VMV0:$vm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
 
     def : Pat<(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1),
                                vti.RegClass:$rd, vti.RegClass:$rs2,
-                               (vti.Mask V0),
+                               (vti.Mask VMV0:$vm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
     }
   }
 }
@@ -1846,11 +1846,11 @@ multiclass VPatFPMulAddVL_VV_VF_RM<SDPatternOperator vop, string instruction_nam
   defvar suffix = vti.LMul.MX;
   let Predicates = GetVTypePredicates<vti>.Predicates in {
     def : Pat<(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rd,
-                               vti.RegClass:$rs2, (vti.Mask V0),
+                               vti.RegClass:$rs2, (vti.Mask VMV0:$vm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
@@ -1858,11 +1858,11 @@ multiclass VPatFPMulAddVL_VV_VF_RM<SDPatternOperator vop, string instruction_nam
 
     def : Pat<(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1),
                                vti.RegClass:$rd, vti.RegClass:$rs2,
-                               (vti.Mask V0),
+                               (vti.Mask VMV0:$vm),
                                VLOpFrag)),
               (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
@@ -1875,34 +1875,34 @@ multiclass VPatFPMulAccVL_VV_VF<PatFrag vop, string instruction_name> {
   foreach vti = AllFloatVectors in {
   defvar suffix = vti.LMul.MX;
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, undef, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, undef, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     }
   }
 }
@@ -1911,46 +1911,46 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
   foreach vti = AllFloatVectors in {
   defvar suffix = vti.LMul.MX;
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
                    GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
                    GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, undef, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
                    vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
                    GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
                            (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
                             vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
                             vti.RegClass:$rd, undef, VLOpFrag),
               (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
                    vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
@@ -1967,18 +1967,18 @@ multiclass VPatWidenFPMulAccVL_VV_VF<SDNode vop, string instruction_name> {
                                  GetVTypePredicates<wti>.Predicates) in {
       def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
                      (vti.Vector vti.RegClass:$rs2),
-                     (wti.Vector wti.RegClass:$rd), (vti.Mask V0),
+                     (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
                      VLOpFrag),
                 (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX #"_MASK")
                    wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
       def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)),
                      (vti.Vector vti.RegClass:$rs2),
-                     (wti.Vector wti.RegClass:$rd), (vti.Mask V0),
+                     (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
                      VLOpFrag),
                 (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX #"_MASK")
                    wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
     }
   }
 }
@@ -1991,22 +1991,22 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM<SDNode vop, string instruction_name> {
                                  GetVTypePredicates<wti>.Predicates) in {
       def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
                      (vti.Vector vti.RegClass:$rs2),
-                     (wti.Vector wti.RegClass:$rd), (vti.Mask V0),
+                     (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
                      VLOpFrag),
                 (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX #"_MASK")
                    wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
                    GPR:$vl, vti.Log2SEW, TA_MA)>;
       def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)),
                      (vti.Vector vti.RegClass:$rs2),
-                     (wti.Vector wti.RegClass:$rd), (vti.Mask V0),
+                     (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
                      VLOpFrag),
                 (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX #"_MASK")
                    wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
@@ -2020,20 +2020,20 @@ multiclass VPatSlideVL_VX_VI<SDNode vop, string instruction_name> {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rd),
                                  (vti.Vector vti.RegClass:$rs1),
-                                 uimm5:$rs2, (vti.Mask V0),
+                                 uimm5:$rs2, (vti.Mask VMV0:$vm),
                                  VLOpFrag, (XLenVT timm:$policy))),
                 (!cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK")
                     vti.RegClass:$rd, vti.RegClass:$rs1, uimm5:$rs2,
-                    (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                    (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                     (XLenVT timm:$policy))>;
 
       def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rd),
                                  (vti.Vector vti.RegClass:$rs1),
-                                 GPR:$rs2, (vti.Mask V0),
+                                 GPR:$rs2, (vti.Mask VMV0:$vm),
                                  VLOpFrag, (XLenVT timm:$policy))),
                 (!cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX#"_MASK")
                     vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2,
-                    (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                    (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                     (XLenVT timm:$policy))>;
     }
   }
@@ -2044,10 +2044,10 @@ multiclass VPatSlide1VL_VX<SDNode vop, string instruction_name> {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rs3),
                                  (vti.Vector vti.RegClass:$rs1),
-                                 GPR:$rs2, (vti.Mask V0), VLOpFrag)),
+                                 GPR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag)),
                 (!cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX#"_MASK")
                     vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
-                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
+                    (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
     }
   }
 }
@@ -2057,10 +2057,10 @@ multiclass VPatSlide1VL_VF<SDNode vop, string instruction_name> {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rs3),
                                  (vti.Vector vti.RegClass:$rs1),
-                                 vti.Scalar:$rs2, (vti.Mask V0), VLOpFrag)),
+                                 vti.Scalar:$rs2, (vti.Mask VMV0:$vm), VLOpFrag)),
                 (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_MASK")
                     vti.RegClass:$rs3, vti.RegClass:$rs1, vti.Scalar:$rs2,
-                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
+                    (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
     }
   }
 }
@@ -2070,16 +2070,16 @@ multiclass VPatAVGADDVL_VV_VX_RM<SDNode vop, int vxrm> {
     let Predicates = GetVTypePredicates<vti>.Predicates in {
       def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
                      (vti.Vector vti.RegClass:$rs2),
-                     vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                     vti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag),
                 (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX#"_MASK")
                   vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                  (vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                  (vti.Mask VMV0:$vm), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
       def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
                      (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
-                     vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                     vti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag),
                 (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX#"_MASK")
                   vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
-                  (vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                  (vti.Mask VMV0:$vm), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     }
   }
 }
@@ -2099,16 +2099,16 @@ foreach vti = AllIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
     def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
                             (vti.Vector vti.RegClass:$rs1),
-                            vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                            vti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
                             (vti.Vector vti.RegClass:$rs1),
-                            vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                            vti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs1, simm5:$rs2,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -2126,22 +2126,22 @@ foreach vtiToWti = AllWidenableIntVectors in {
                                GetVTypePredicates<wti>.Predicates) in {
     def : Pat<(riscv_shl_vl (wti.Vector (riscv_sext_vl_oneuse
                               (vti.Vector vti.RegClass:$rs1),
-                              (vti.Mask V0), VLOpFrag)),
+                              (vti.Mask VMV0:$vm), VLOpFrag)),
                             (wti.Vector (riscv_vmv_v_x_vl
                               (wti.Vector undef), 1, VLOpFrag)),
-                              wti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                              wti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWADD_VV_"#vti.LMul.MX#"_MASK")
                wti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs1,
-               (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+               (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse
                               (vti.Vector vti.RegClass:$rs1),
-                              (vti.Mask V0), VLOpFrag)),
+                              (vti.Mask VMV0:$vm), VLOpFrag)),
                             (wti.Vector (riscv_vmv_v_x_vl
                               (wti.Vector undef), 1, VLOpFrag)),
-                              wti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+                              wti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWADDU_VV_"#vti.LMul.MX#"_MASK")
                wti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs1,
-               (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+               (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -2202,11 +2202,11 @@ foreach vtiTowti = AllWidenableIntVectors in {
   let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                GetVTypePredicates<wti>.Predicates) in
   def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1),
-                                               (vti.Mask V0),
+                                               (vti.Mask VMV0:$vm),
                                                VLOpFrag)),
             (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX#"_MASK")
                 (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-                (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
 }
 
 // 11.8. Vector Integer Comparison Instructions
@@ -2293,41 +2293,41 @@ foreach vtiTowti = AllWidenableIntVectors in {
   def : Pat<(riscv_vwmaccsu_vl (vti.Vector vti.RegClass:$rs1),
                                (SplatPat XLenVT:$rs2),
                                (wti.Vector wti.RegClass:$rd),
-                               (vti.Mask V0), VLOpFrag),
+                               (vti.Mask VMV0:$vm), VLOpFrag),
             (!cast<Instruction>("PseudoVWMACCUS_VX_"#vti.LMul.MX#"_MASK")
                 wti.RegClass:$rd, vti.ScalarRegClass:$rs2, vti.RegClass:$rs1,
-                (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 }
 
 // 11.15. Vector Integer Merge Instructions
 foreach vti = AllIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask VMV0:$vm),
                                            vti.RegClass:$rs1,
                                            vti.RegClass:$rs2,
                                            vti.RegClass:$merge,
                                            VLOpFrag)),
               (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
                   vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
-                  (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                  (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
 
-    def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask VMV0:$vm),
                                             (SplatPat XLenVT:$rs1),
                                             vti.RegClass:$rs2,
                                             vti.RegClass:$merge,
                                             VLOpFrag)),
               (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
                   vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
-                  (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                  (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
 
-    def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
+    def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask VMV0:$vm),
                                            (SplatPat_simm5 simm5:$rs1),
                                            vti.RegClass:$rs2,
                                            vti.RegClass:$merge,
                                            VLOpFrag)),
               (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
                   vti.RegClass:$merge, vti.RegClass:$rs2, simm5:$rs1,
-                  (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+                  (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>;
   }
 }
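
The hunks above and below all apply the same mechanical substitution, so a minimal
illustrative sketch of the recurring before/after shape may help when reading them.
This is not an excerpt from the patch: riscv_op_vl and PSEUDO_MASK are placeholder
names standing in for whichever VL node and masked pseudo a given pattern uses; the
real classes and multiclasses are the ones shown in the diff.

    // Before: the mask operand is matched as the physical register V0, so each
    // masked pattern hard-codes the mask into V0 at instruction selection time.
    def : Pat<(vti.Vector (riscv_op_vl vti.RegClass:$rs1, vti.RegClass:$rs2,
                                       vti.RegClass:$merge, (vti.Mask V0), VLOpFrag)),
              (PSEUDO_MASK vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2,
                           (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;

    // After: the mask is captured as a $vm operand of the VMV0 register class
    // (which, as used here, holds only V0), so the pattern carries a normal
    // operand and the V0 placement is enforced by the register class rather
    // than by the pattern itself.
    def : Pat<(vti.Vector (riscv_op_vl vti.RegClass:$rs1, vti.RegClass:$rs2,
                                       vti.RegClass:$merge, (vti.Mask VMV0:$vm), VLOpFrag)),
              (PSEUDO_MASK vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2,
                           (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;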
 
@@ -2377,13 +2377,13 @@ class VPatTruncSatClipMaxMinBase<string inst,
           (wti.Vector (op2
             (wti.Vector wti.RegClass:$rs1),
             (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), op2_value, (XLenVT srcvalue))),
-            (wti.Vector undef),(wti.Mask V0), VLOpFrag)),
+            (wti.Vector undef),(wti.Mask VMV0:$vm), VLOpFrag)),
           (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), op1_value, (XLenVT srcvalue))),
-          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
-        (vti.Mask V0), VLOpFrag)),
+          (wti.Vector undef), (wti.Mask VMV0:$vm), VLOpFrag)),
+        (vti.Mask VMV0:$vm), VLOpFrag)),
       (!cast<Instruction>(inst#"_WI_"#vti.LMul.MX#"_MASK")
         (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+        (vti.Mask VMV0:$vm), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
 
 class VPatTruncSatClipUMin<VTypeInfo vti,
                            VTypeInfo wti,
@@ -2392,11 +2392,11 @@ class VPatTruncSatClipUMin<VTypeInfo vti,
         (wti.Vector (riscv_umin_vl
           (wti.Vector wti.RegClass:$rs1),
           (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), uminval, (XLenVT srcvalue))),
-          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
-        (vti.Mask V0), VLOpFrag)),
+          (wti.Vector undef), (wti.Mask VMV0:$vm), VLOpFrag)),
+        (vti.Mask VMV0:$vm), VLOpFrag)),
       (!cast<Instruction>("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK")
         (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+        (vti.Mask VMV0:$vm), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
 
 multiclass VPatTruncSatClipMaxMin<string inst, VTypeInfo vti, VTypeInfo wti,
   SDPatternOperator max, int maxval, SDPatternOperator min, int minval> {
@@ -2482,39 +2482,39 @@ defm : VPatFPSetCCVL_VV_VF_FV<any_riscv_fsetccs_vl, SETOLE,
 foreach vti = AllFloatVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
     // 13.8. Vector Floating-Point Square-Root Instruction
-    def : Pat<(any_riscv_fsqrt_vl (vti.Vector vti.RegClass:$rs2), (vti.Mask V0),
+    def : Pat<(any_riscv_fsqrt_vl (vti.Vector vti.RegClass:$rs2), (vti.Mask VMV0:$vm),
                               VLOpFrag),
               (!cast<Instruction>("PseudoVFSQRT_V_"# vti.LMul.MX # "_E" # vti.SEW # "_MASK")
                    (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    // Value to indicate no rounding mode change in
                    // RISCVInsertReadWriteCSR
                    FRM_DYN,
                    GPR:$vl, vti.Log2SEW, TA_MA)>;
 
     // 13.12. Vector Floating-Point Sign-Injection Instructions
-    def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask V0),
+    def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm),
                              VLOpFrag),
               (!cast<Instruction>("PseudoVFSGNJX_VV_"# vti.LMul.MX #"_MASK")
                    (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs,
-                   vti.RegClass:$rs, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                   vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                    TA_MA)>;
     // Handle fneg with VFSGNJN using the same input for both operands.
-    def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask V0),
+    def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm),
                              VLOpFrag),
               (!cast<Instruction>("PseudoVFSGNJN_VV_"# vti.LMul.MX #"_MASK")
                    (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs,
-                   vti.RegClass:$rs, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                   vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                    TA_MA)>;
 
     def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
                                   (vti.Vector vti.RegClass:$rs2),
                                   vti.RegClass:$merge,
-                                  (vti.Mask V0),
+                                  (vti.Mask VMV0:$vm),
                                   VLOpFrag),
               (!cast<Instruction>("PseudoVFSGNJ_VV_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs1,
-                   vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                   vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                    TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
@@ -2531,26 +2531,26 @@ foreach vti = AllFloatVectors in {
     def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
                                   (SplatFPOp vti.ScalarRegClass:$rs2),
                                   vti.RegClass:$merge,
-                                  (vti.Mask V0),
+                                  (vti.Mask VMV0:$vm),
                                   VLOpFrag),
               (!cast<Instruction>("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs1,
-                   vti.ScalarRegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                   vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                    TAIL_AGNOSTIC)>;
 
     // Rounding without exception to implement nearbyint.
     def : Pat<(any_riscv_vfround_noexcept_vl (vti.Vector vti.RegClass:$rs1),
-                                             (vti.Mask V0), VLOpFrag),
+                                             (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVFROUND_NOEXCEPT_V_" # vti.LMul.MX #"_MASK")
                     (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1,
-                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                    (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
 
     // 14.14. Vector Floating-Point Classify Instruction
     def : Pat<(riscv_fclass_vl (vti.Vector vti.RegClass:$rs2),
-                               (vti.Mask V0), VLOpFrag),
+                               (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVFCLASS_V_"# vti.LMul.MX #"_MASK")
                  (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, 
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -2560,37 +2560,37 @@ foreach fvti = AllFloatVectors in {
   // 13.15. Vector Floating-Point Merge Instruction
   defvar ivti = GetIntVTypeInfo<fvti>.Vti;
   let Predicates = GetVTypePredicates<ivti>.Predicates in {
-  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
                                           fvti.RegClass:$rs1,
                                           fvti.RegClass:$rs2,
                                           fvti.RegClass:$merge,
                                           VLOpFrag)),
             (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
-                 fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
+                 fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm),
                  GPR:$vl, fvti.Log2SEW)>;
 
-  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
                                           (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))),
                                           fvti.RegClass:$rs2,
                                           fvti.RegClass:$merge,
                                           VLOpFrag)),
             (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
-                 fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0),
+                 fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm),
                  GPR:$vl, fvti.Log2SEW)>;
 
 
-  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
                                           (SplatFPOp (fvti.Scalar fpimm0)),
                                           fvti.RegClass:$rs2,
                                           fvti.RegClass:$merge,
                                           VLOpFrag)),
             (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
-                 fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
+                 fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm),
                  GPR:$vl, fvti.Log2SEW)>;
   }
 
   let Predicates = GetVTypePredicates<fvti>.Predicates in {
-    def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+    def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
                                             (SplatFPOp fvti.ScalarRegClass:$rs1),
                                             fvti.RegClass:$rs2,
                                             fvti.RegClass:$merge,
@@ -2598,7 +2598,7 @@ foreach fvti = AllFloatVectors in {
               (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
                    fvti.RegClass:$merge, fvti.RegClass:$rs2,
                    (fvti.Scalar fvti.ScalarRegClass:$rs1),
-                   (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
+                   (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>;
   }
 }
 
@@ -2660,11 +2660,11 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
                                    GetVTypePredicates<fwti>.Predicates)) in
   def : Pat<(fwti.Vector (any_riscv_fpextend_vl
                              (fvti.Vector fvti.RegClass:$rs1),
-                             (fvti.Mask V0),
+                             (fvti.Mask VMV0:$vm),
                              VLOpFrag)),
             (!cast<Instruction>("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX#"_MASK")
                 (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
-                (fvti.Mask V0),
+                (fvti.Mask VMV0:$vm),
                 GPR:$vl, fvti.Log2SEW, TA_MA)>;
 }
 
@@ -2692,10 +2692,10 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
                                    GetVTypePredicates<fwti>.Predicates)) in {
     def : Pat<(fvti.Vector (any_riscv_fpround_vl
                                (fwti.Vector fwti.RegClass:$rs1),
-                               (fwti.Mask V0), VLOpFrag)),
+                               (fwti.Mask VMV0:$vm), VLOpFrag)),
               (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_MASK")
                   (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
-                  (fwti.Mask V0),
+                  (fwti.Mask VMV0:$vm),
                   // Value to indicate no rounding mode change in
                   // RISCVInsertReadWriteCSR
                   FRM_DYN,
@@ -2705,10 +2705,10 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
                                GetVTypePredicates<fwti>.Predicates) in
     def : Pat<(fvti.Vector (any_riscv_fncvt_rod_vl
                                (fwti.Vector fwti.RegClass:$rs1),
-                               (fwti.Mask V0), VLOpFrag)),
+                               (fwti.Mask VMV0:$vm), VLOpFrag)),
               (!cast<Instruction>("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX#"_MASK")
                   (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
-                  (fwti.Mask V0), GPR:$vl, fvti.Log2SEW, TA_MA)>;
+                  (fwti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>;
   }
 }
 
@@ -2812,20 +2812,20 @@ foreach mti = AllMasks in {
                                       VLOpFrag)),
               (!cast<Instruction>("PseudoVCPOP_M_" # mti.BX)
                    VR:$rs2, GPR:$vl, mti.Log2SEW)>;
-    def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask V0),
+    def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm),
                                       VLOpFrag)),
               (!cast<Instruction>("PseudoVCPOP_M_" # mti.BX # "_MASK")
-                   VR:$rs2, (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
+                   VR:$rs2, (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>;
 
     // 15.3 vfirst find-first-set mask bit
     def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask true_mask),
                                       VLOpFrag)),
               (!cast<Instruction>("PseudoVFIRST_M_" # mti.BX)
                    VR:$rs2, GPR:$vl, mti.Log2SEW)>;
-    def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask V0),
+    def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm),
                                       VLOpFrag)),
               (!cast<Instruction>("PseudoVFIRST_M_" # mti.BX # "_MASK")
-                   VR:$rs2, (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
+                   VR:$rs2, (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>;
   }
 }
 
@@ -2848,26 +2848,26 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2,
                                                 vti.RegClass:$rs1,
                                                 vti.RegClass:$merge,
-                                                (vti.Mask V0),
+                                                (vti.Mask VMV0:$vm),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
                                                 vti.RegClass:$merge,
-                                                (vti.Mask V0),
+                                                (vti.Mask VMV0:$vm),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2,
                                                 uimm5:$imm,
                                                 vti.RegClass:$merge,
-                                                (vti.Mask V0),
+                                                (vti.Mask VMV0:$vm),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 
   // emul = lmul * 16 / sew
@@ -2883,11 +2883,11 @@ foreach vti = AllIntegerVectors in {
                (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
                                          (ivti.Vector ivti.RegClass:$rs1),
                                          vti.RegClass:$merge,
-                                         (vti.Mask V0),
+                                         (vti.Mask VMV0:$vm),
                                          VLOpFrag)),
               (!cast<Instruction>(inst#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -2921,27 +2921,27 @@ foreach vti = AllFloatVectors in {
                (riscv_vrgather_vv_vl vti.RegClass:$rs2,
                                      (ivti.Vector vti.RegClass:$rs1),
                                      vti.RegClass:$merge,
-                                     (vti.Mask V0),
+                                     (vti.Mask VMV0:$vm),
                                      VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
                                                 vti.RegClass:$merge,
-                                                (vti.Mask V0),
+                                                (vti.Mask VMV0:$vm),
                                                 VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
     def : Pat<(vti.Vector
                (riscv_vrgather_vx_vl vti.RegClass:$rs2,
                                      uimm5:$imm,
                                      vti.RegClass:$merge,
-                                     (vti.Mask V0),
+                                     (vti.Mask VMV0:$vm),
                                      VLOpFrag)),
               (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 
   defvar vlmul = vti.LMul;
@@ -2957,11 +2957,11 @@ foreach vti = AllFloatVectors in {
                (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
                                          (ivti.Vector ivti.RegClass:$rs1),
                                          vti.RegClass:$merge,
-                                         (vti.Mask V0),
+                                         (vti.Mask VMV0:$vm),
                                          VLOpFrag)),
               (!cast<Instruction>(inst#"_MASK")
                    vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
-                   (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -2998,10 +2998,10 @@ def riscv_fslide1down_vl  : SDNode<"RISCVISD::VFSLIDE1DOWN_VL", SDTRVVFSlide1, [
 
 foreach vti = AllIntegerVectors in {
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (riscv_vid_vl (vti.Mask V0),
+    def : Pat<(vti.Vector (riscv_vid_vl (vti.Mask VMV0:$vm),
                                         VLOpFrag)),
               (!cast<Instruction>("PseudoVID_V_"#vti.LMul.MX#"_MASK")
-                  (vti.Vector (IMPLICIT_DEF)), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                  (vti.Vector (IMPLICIT_DEF)), (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW,
                   TAIL_AGNOSTIC)>;
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 575f9b41accb51..2d72e98438ec16 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -537,12 +537,12 @@ multiclass VPatUnaryVL_V<SDPatternOperator op, string instruction_name,
                                  GetVTypePredicates<vti>.Predicates) in {
       def : Pat<(vti.Vector (op (vti.Vector vti.RegClass:$rs1),
                                 (vti.Vector vti.RegClass:$merge),
-                                (vti.Mask V0),
+                                (vti.Mask VMV0:$vm),
                                 VLOpFrag)),
                 (!cast<Instruction>(instruction_name#"_V_"#vti.LMul.MX#"_MASK")
                    vti.RegClass:$merge,
                    vti.RegClass:$rs1,
-                   (vti.Mask V0),
+                   (vti.Mask VMV0:$vm),
                    GPR:$vl,
                    vti.Log2SEW,
                    TAIL_AGNOSTIC)>;
@@ -557,17 +557,17 @@ foreach vti = AllIntegerVectors in {
                                            (vti.Vector vti.RegClass:$rs1),
                                            (riscv_splat_vector -1),
                                            (vti.Vector vti.RegClass:$merge),
-                                           (vti.Mask V0),
+                                           (vti.Mask VMV0:$vm),
                                            VLOpFrag),
                                         (vti.Vector vti.RegClass:$rs2),
                                         (vti.Vector vti.RegClass:$merge),
-                                        (vti.Mask V0),
+                                        (vti.Mask VMV0:$vm),
                                         VLOpFrag)),
               (!cast<Instruction>("PseudoVANDN_VV_"#vti.LMul.MX#"_MASK")
                  vti.RegClass:$merge,
                  vti.RegClass:$rs2,
                  vti.RegClass:$rs1,
-                 (vti.Mask V0),
+                 (vti.Mask VMV0:$vm),
                  GPR:$vl,
                  vti.Log2SEW,
                  TAIL_AGNOSTIC)>;
@@ -576,13 +576,13 @@ foreach vti = AllIntegerVectors in {
                                            (not vti.ScalarRegClass:$rs1)),
                                         (vti.Vector vti.RegClass:$rs2),
                                         (vti.Vector vti.RegClass:$merge),
-                                        (vti.Mask V0),
+                                        (vti.Mask VMV0:$vm),
                                         VLOpFrag)),
               (!cast<Instruction>("PseudoVANDN_VX_"#vti.LMul.MX#"_MASK")
                  vti.RegClass:$merge,
                  vti.RegClass:$rs2,
                  vti.ScalarRegClass:$rs1,
-                 (vti.Mask V0),
+                 (vti.Mask VMV0:$vm),
                  GPR:$vl,
                  vti.Log2SEW,
                  TAIL_AGNOSTIC)>;
@@ -604,12 +604,12 @@ foreach vti = AllIntegerVectors in {
     def : Pat<(riscv_rotl_vl vti.RegClass:$rs2,
                              (vti.Vector (SplatPat_uimm6 uimm6:$rs1)),
                              (vti.Vector vti.RegClass:$merge),
-                             (vti.Mask V0), VLOpFrag),
+                             (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVROR_VI_"#vti.LMul.MX#"_MASK")
                  vti.RegClass:$merge,
                  vti.RegClass:$rs2,
                  (!cast<SDNodeXForm>("InvRot" # vti.SEW # "Imm") uimm6:$rs1),
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 defm : VPatBinaryVL_VV_VX_VI<riscv_rotr_vl, "PseudoVROR", uimm6>;
@@ -624,90 +624,90 @@ foreach vtiToWti = AllWidenableIntVectors in {
                  (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
                  (wti.Vector (ext_oneuse (vti.Vector vti.RegClass:$rs1))),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (riscv_zext_vl_oneuse
                                 (vti.Vector vti.RegClass:$rs2),
-                                (vti.Mask V0), VLOpFrag)),
+                                (vti.Mask VMV0:$vm), VLOpFrag)),
                  (wti.Vector (riscv_ext_vl_oneuse
                                 (vti.Vector vti.RegClass:$rs1),
-                                (vti.Mask V0), VLOpFrag)),
+                                (vti.Mask VMV0:$vm), VLOpFrag)),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
                  (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (riscv_zext_vl_oneuse
                                 (vti.Vector vti.RegClass:$rs2),
-                                (vti.Mask V0), VLOpFrag)),
+                                (vti.Mask VMV0:$vm), VLOpFrag)),
                  (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
                  (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_shl_vl
                  (wti.Vector (riscv_zext_vl_oneuse
                                 (vti.Vector vti.RegClass:$rs2),
-                                (vti.Mask V0), VLOpFrag)),
+                                (vti.Mask VMV0:$vm), VLOpFrag)),
                  (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_vwsll_vl
                  (vti.Vector vti.RegClass:$rs2),
                  (vti.Vector vti.RegClass:$rs1),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_vwsll_vl
                  (vti.Vector vti.RegClass:$rs2),
                  (vti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 
     def : Pat<(riscv_vwsll_vl
                  (vti.Vector vti.RegClass:$rs2),
                  (vti.Vector (SplatPat_uimm5 uimm5:$rs1)),
                  (wti.Vector wti.RegClass:$merge),
-                 (vti.Mask V0), VLOpFrag),
+                 (vti.Mask VMV0:$vm), VLOpFrag),
               (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
                  wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
-                 (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+                 (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
   }
 }
 
@@ -851,12 +851,12 @@ multiclass VPatBinaryV_VI_VROL<string intrinsic, string instruction,
     def : Pat<(vti.Vector (IntrMask (vti.Vector vti.RegClass:$merge),
                           (vti.Vector vti.RegClass:$rs2),
                           (XLenVT uimm6:$rs1),
-                          (vti.Mask V0),
+                          (vti.Mask VMV0:$vm),
                           VLOpFrag, (XLenVT timm:$policy))),
                           (PseudoMask (vti.Vector vti.RegClass:$merge),
                           (vti.Vector vti.RegClass:$rs2),
                           (InvRot64Imm uimm6:$rs1),
-                          (vti.Mask V0),
+                          (vti.Mask VMV0:$vm),
                           GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
     }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 7e04e9154b524e..146ac5700db04d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -132,6 +132,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
 
   const TargetRegisterClass *
   getLargestSuperClass(const TargetRegisterClass *RC) const override {
+    if (RISCV::VMV0RegClass.hasSubClassEq(RC))
+      return &RISCV::VMV0RegClass;
     if (RISCV::VRM8RegClass.hasSubClassEq(RC))
       return &RISCV::VRM8RegClass;
     if (RISCV::VRM4RegClass.hasSubClassEq(RC))
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 316daf2763ca1e..66ed3f51662cac 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -549,7 +549,7 @@ def VRM8 : VReg<VM8VTs, (add V8M8, V16M8, V24M8, V0M8), 8>;
 
 def VRM8NoV0 : VReg<VM8VTs, (sub VRM8, V0M8), 8>;
 
-def VMV0 : VReg<VMaskVTs, (add V0), 1>;
+def VMV0 : VReg<VMaskVTs, (add V0, V0, V0, V0, V0), 1>;
 
 let RegInfos = XLenRI in {
 def GPRF16  : RISCVRegisterClass<[f16], 16, (add GPR)>;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
index 42bf3212287053..f8061462c62207 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
@@ -10,20 +10,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv1i8
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
     ; RV32I-NEXT: PseudoRET implicit $v8
     ;
     ; RV64I-LABEL: name: select_nxv1i8
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
     ; RV64I-NEXT: PseudoRET implicit $v8
     %0:vrb(<vscale x 2 x s1>) = G_IMPLICIT_DEF
@@ -41,20 +39,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv4i8
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
     ; RV32I-NEXT: PseudoRET implicit $v8
     ;
     ; RV64I-LABEL: name: select_nxv4i8
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
     ; RV64I-NEXT: PseudoRET implicit $v8
     %0:vrb(<vscale x 8 x s1>) = G_IMPLICIT_DEF
@@ -72,20 +68,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv16i8
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
     ; RV32I-NEXT: PseudoRET implicit $v8m4
     ;
     ; RV64I-LABEL: name: select_nxv16i8
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
     ; RV64I-NEXT: PseudoRET implicit $v8m4
     %0:vrb(<vscale x 32 x s1>) = G_IMPLICIT_DEF
@@ -103,20 +97,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv64i8
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
     ; RV32I-NEXT: PseudoRET implicit $v8
     ;
     ; RV64I-LABEL: name: select_nxv64i8
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
     ; RV64I-NEXT: PseudoRET implicit $v8
     %0:vrb(<vscale x 1 x s1>) = G_IMPLICIT_DEF
@@ -134,20 +126,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv2i16
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
     ; RV32I-NEXT: PseudoRET implicit $v8
     ;
     ; RV64I-LABEL: name: select_nxv2i16
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
     ; RV64I-NEXT: PseudoRET implicit $v8
     %0:vrb(<vscale x 4 x s1>) = G_IMPLICIT_DEF
@@ -165,20 +155,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv8i16
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
     ; RV32I-NEXT: PseudoRET implicit $v8m4
     ;
     ; RV64I-LABEL: name: select_nxv8i16
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
     ; RV64I-NEXT: PseudoRET implicit $v8m4
     %0:vrb(<vscale x 16 x s1>) = G_IMPLICIT_DEF
@@ -196,20 +184,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv32i16
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]]
     ; RV32I-NEXT: PseudoRET implicit $v8
     ;
     ; RV64I-LABEL: name: select_nxv32i16
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]]
     ; RV64I-NEXT: PseudoRET implicit $v8
     %0:vrb(<vscale x 1 x s1>) = G_IMPLICIT_DEF
@@ -227,20 +213,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv2i32
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
     ; RV32I-NEXT: PseudoRET implicit $v8m2
     ;
     ; RV64I-LABEL: name: select_nxv2i32
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
     ; RV64I-NEXT: PseudoRET implicit $v8m2
     %0:vrb(<vscale x 4 x s1>) = G_IMPLICIT_DEF
@@ -258,20 +242,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv8i32
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
     ; RV32I-NEXT: PseudoRET implicit $v8m8
     ;
     ; RV64I-LABEL: name: select_nxv8i32
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
     ; RV64I-NEXT: PseudoRET implicit $v8m8
     %0:vrb(<vscale x 16 x s1>) = G_IMPLICIT_DEF
@@ -289,20 +271,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv1i64
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
     ; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
     ; RV32I-NEXT: PseudoRET implicit $v8m2
     ;
     ; RV64I-LABEL: name: select_nxv1i64
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
     ; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
     ; RV64I-NEXT: PseudoRET implicit $v8m2
     %0:vrb(<vscale x 2 x s1>) = G_IMPLICIT_DEF
@@ -320,20 +300,18 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv4i64
-    ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
-    ; RV32I-NEXT: $v0 = COPY [[DEF]]
-    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */
+    ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
     ; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
     ; RV32I-NEXT: PseudoRET implicit $v8m8
     ;
     ; RV64I-LABEL: name: select_nxv4i64
-    ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
-    ; RV64I-NEXT: $v0 = COPY [[DEF]]
-    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */
+    ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
     ; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
     ; RV64I-NEXT: PseudoRET implicit $v8m8
     %0:vrb(<vscale x 8 x s1>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index 83a4f63add337f..075678a6655b5e 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -24,31 +24,31 @@ define void @_Z3foov() {
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_49)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_49)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_48)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_48)
-; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_46)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_46)
-; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_45)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_45)
-; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v14, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v12, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v14, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vs2r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 65d0768c60885d..c3f91d23e8a1b4 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -108,15 +108,15 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v24, a0
-; RV64-NEXT:    vid.v v16
+; RV64-NEXT:    vmv.v.x v16, a0
+; RV64-NEXT:    vid.v v24
 ; RV64-NEXT:    li a1, -1
-; RV64-NEXT:    vmadd.vx v16, a1, v24
+; RV64-NEXT:    vmadd.vx v24, a1, v16
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v16, v0
+; RV64-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/abd.ll b/llvm/test/CodeGen/RISCV/rvv/abd.ll
index ddbfbd0b59fa4b..affbae9c35cdb1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abd.ll
@@ -26,13 +26,13 @@ define <vscale x 16 x i8> @sabd_b_promoted_ops(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-LABEL: sabd_b_promoted_ops:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmerge.vim v12, v10, -1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmerge.vim v10, v12, -1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, -1, v0
-; CHECK-NEXT:    vmin.vv v10, v12, v8
-; CHECK-NEXT:    vmax.vv v8, v12, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
+; CHECK-NEXT:    vmin.vv v12, v10, v8
+; CHECK-NEXT:    vmax.vv v8, v10, v8
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %a.sext = sext <vscale x 16 x i1> %a to <vscale x 16 x i8>
   %b.sext = sext <vscale x 16 x i1> %b to <vscale x 16 x i8>
@@ -158,13 +158,13 @@ define <vscale x 16 x i8> @uabd_b_promoted_ops(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-LABEL: uabd_b_promoted_ops:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vminu.vv v10, v12, v8
-; CHECK-NEXT:    vmaxu.vv v8, v12, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vminu.vv v12, v10, v8
+; CHECK-NEXT:    vmaxu.vv v8, v10, v8
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %a.zext = zext <vscale x 16 x i1> %a to <vscale x 16 x i8>
   %b.zext = zext <vscale x 16 x i1> %b to <vscale x 16 x i8>
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index eb74e2d302f1a8..698ecbf225a92d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -561,50 +561,26 @@ declare <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64>, i1 immarg
 define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_abs_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
-; CHECK-NEXT:    vmax.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
+; CHECK-NEXT:    vmax.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vmax.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
+; CHECK-NEXT:    vmax.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 879dff4a6e490b..b446175e591d2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -2016,29 +2016,29 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v20, v12, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetvli a6, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a5), zero
+; RV32-NEXT:    vlse64.v v20, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v20, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
-; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
@@ -2313,7 +2313,9 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetvli a6, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -2323,36 +2325,38 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2419,27 +2423,27 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -2671,7 +2675,9 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetvli a6, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -2681,36 +2687,38 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2777,27 +2785,27 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -3004,97 +3012,73 @@ declare <vscale x 64 x i16> @llvm.vp.bitreverse.nxv64i16(<vscale x 64 x i16>, <v
 define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bitreverse_nxv64i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a1, a2, 1
+; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a1, a3, a1
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
-; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    addi a2, a2, -241
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 2, v0.t
 ; CHECK-NEXT:    lui a3, 3
 ; CHECK-NEXT:    addi a3, a3, 819
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a3, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; CHECK-NEXT:    lui a4, 5
 ; CHECK-NEXT:    addi a4, a4, 1365
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a4, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a0, a2, .LBB46_2
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    bltu a0, a1, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a3, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a4, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv64i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 1
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
@@ -3108,7 +3092,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-ZVBB-NEXT:    mv a0, a1
 ; CHECK-ZVBB-NEXT:  .LBB46_2:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVBB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVBB-NEXT:    vbrev.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i16> @llvm.vp.bitreverse.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 800d06c5a78f54..7fbdfb30214087 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -864,29 +864,29 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v20, v12, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetvli a6, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a5), zero
+; RV32-NEXT:    vlse64.v v20, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v20, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
-; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -1047,7 +1047,9 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetvli a6, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -1057,36 +1059,38 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1123,27 +1127,27 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -1290,7 +1294,9 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetvli a6, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -1300,36 +1306,38 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1366,27 +1374,27 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -1508,21 +1516,10 @@ declare <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16>, <vscale
 define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bswap_nxv64i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -1530,39 +1527,26 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16:
 ; CHECK-ZVKB:       # %bb.0:
-; CHECK-ZVKB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVKB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVKB-NEXT:    srli a2, a1, 1
 ; CHECK-ZVKB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
@@ -1576,7 +1560,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-ZVKB-NEXT:    mv a0, a1
 ; CHECK-ZVKB-NEXT:  .LBB32_2:
 ; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVKB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVKB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
 ; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 5b271606f08abe..ddd50284591cb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -135,21 +135,21 @@ declare <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half>, <vscale x
 define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x half> %v
@@ -179,21 +179,21 @@ declare <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half>, <vscal
 define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI8_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x half> %v
@@ -223,21 +223,21 @@ declare <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half>, <vscal
 define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI10_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -351,21 +351,21 @@ declare <vscale x 4 x float> @llvm.vp.ceil.nxv4f32(<vscale x 4 x float>, <vscale
 define <vscale x 4 x float> @vp_ceil_vv_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.ceil.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -395,21 +395,21 @@ declare <vscale x 8 x float> @llvm.vp.ceil.nxv8f32(<vscale x 8 x float>, <vscale
 define <vscale x 8 x float> @vp_ceil_vv_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.ceil.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -439,21 +439,21 @@ declare <vscale x 16 x float> @llvm.vp.ceil.nxv16f32(<vscale x 16 x float>, <vsc
 define <vscale x 16 x float> @vp_ceil_vv_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.ceil.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -525,21 +525,21 @@ declare <vscale x 2 x double> @llvm.vp.ceil.nxv2f64(<vscale x 2 x double>, <vsca
 define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.ceil.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -569,21 +569,21 @@ declare <vscale x 4 x double> @llvm.vp.ceil.nxv4f64(<vscale x 4 x double>, <vsca
 define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.ceil.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -613,21 +613,21 @@ declare <vscale x 7 x double> @llvm.vp.ceil.nxv7f64(<vscale x 7 x double>, <vsca
 define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.ceil.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -657,21 +657,21 @@ declare <vscale x 8 x double> @llvm.vp.ceil.nxv8f64(<vscale x 8 x double>, <vsca
 define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.ceil.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -702,19 +702,11 @@ declare <vscale x 16 x double> @llvm.vp.ceil.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -722,51 +714,36 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.ceil.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index 113154c0f9855b..156f20167e275b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -19,18 +19,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV32-LABEL: constant_folding_crash:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    andi a0, a0, 1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a0
-; RV32-NEXT:    vmsne.vi v0, v11, 0
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmv1r.v v0, v11
 ; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-NEXT:    vrgather.vi v9, v8, 0
 ; RV32-NEXT:    vmsne.vi v0, v9, 0
@@ -42,18 +42,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV64-LABEL: constant_folding_crash:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    ld a0, 8(a0)
-; RV64-NEXT:    vmv1r.v v12, v0
+; RV64-NEXT:    vmv1r.v v13, v0
 ; RV64-NEXT:    andi a0, a0, 1
 ; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vmv.v.x v13, a0
-; RV64-NEXT:    vmsne.vi v0, v13, 0
+; RV64-NEXT:    vmv.v.x v12, a0
+; RV64-NEXT:    vmsne.vi v0, v12, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vmv1r.v v0, v13
 ; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-NEXT:    vrgather.vi v9, v8, 0
 ; RV64-NEXT:    vmsne.vi v0, v9, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 2a75e5ce7175d3..288bd0e4b7ebe8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -1235,10 +1235,10 @@ declare <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64>, i1 immar
 define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
@@ -1270,10 +1270,10 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
@@ -1286,7 +1286,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    mv a0, a1
 ; CHECK-ZVBB-NEXT:  .LBB46_2:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVBB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
@@ -2465,10 +2465,10 @@ define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
@@ -2497,10 +2497,10 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
@@ -2513,7 +2513,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    mv a0, a1
 ; CHECK-ZVBB-NEXT:  .LBB94_2:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVBB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 883f68aec1f422..094e5bc14f9b25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -2018,228 +2018,131 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    srli a2, a1, 3
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    lui a3, 349525
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a3, a3, 257
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB46_2:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2247,44 +2150,33 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a2, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a2
+; RV64-NEXT:    vslidedown.vx v7, v0, a2
 ; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB46_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:  .LBB46_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a2, a2, 1365
 ; RV64-NEXT:    slli a3, a2, 32
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    addiw a3, a3, 819
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    slli a5, a4, 32
@@ -2297,45 +2189,32 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; RV64-NEXT:    sub a1, a0, a1
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a3, v0.t
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
@@ -2348,7 +2227,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    mv a0, a1
 ; CHECK-ZVBB-NEXT:  .LBB46_2:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVBB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index d13f4d2dca1ff4..c833c76a779ebe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -42,9 +42,9 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv1i8:
@@ -59,9 +59,9 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i8:
@@ -108,9 +108,9 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv2i8:
@@ -125,9 +125,9 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i8:
@@ -174,9 +174,9 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv4i8:
@@ -191,9 +191,9 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i8:
@@ -240,9 +240,9 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v10, 0
 ; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv8i8:
@@ -257,9 +257,9 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v10, 0
 ; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i8:
@@ -306,9 +306,9 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 0
 ; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v10, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v10, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv16i8:
@@ -323,9 +323,9 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 0
 ; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v10, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v10, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv16i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index ef8a6c704a44b3..30572c084badf2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2242,19 +2242,20 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    srli a2, a1, 3
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    sltu a3, a0, a2
@@ -2265,87 +2266,34 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a4, 349525
 ; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v24, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    slli a4, a4, 5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a4, 209715
 ; RV32-NEXT:    addi a4, a4, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v24, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
+; RV32-NEXT:    li a5, 24
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    lui a4, 61681
@@ -2358,18 +2306,18 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    lui a4, 4112
 ; RV32-NEXT:    addi a4, a4, 257
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v16, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a3, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a3, v0.t
 ; RV32-NEXT:    addi a4, sp, 16
@@ -2379,89 +2327,33 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB46_2:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
@@ -2481,7 +2373,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2489,21 +2381,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ;
 ; RV64-LABEL: vp_cttz_nxv16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    vmv1r.v v24, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a2, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v7, v0
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    sub a2, a0, a1
 ; RV64-NEXT:    sltu a3, a0, a2
@@ -2511,80 +2392,67 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    and a3, a3, a2
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vsub.vx v24, v16, a2, v0.t
 ; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    lui a3, 349525
 ; RV64-NEXT:    addiw a3, a3, 1365
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    addiw a4, a4, 819
 ; RV64-NEXT:    slli a5, a4, 32
 ; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v16, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    addiw a5, a5, -241
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    lui a6, 4112
 ; RV64-NEXT:    addiw a6, a6, 257
 ; RV64-NEXT:    slli a7, a6, 32
 ; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
-; RV64-NEXT:    addi t0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    bltu a0, a1, .LBB46_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:  .LBB46_2:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsub.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
@@ -2597,7 +2465,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    mv a0, a1
 ; CHECK-ZVBB-NEXT:  .LBB46_2:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVBB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
@@ -4002,70 +3870,46 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
+; CHECK-NEXT:    vand.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    fsrmi a2, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    li a3, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    addi a4, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsub.vx v16, v16, a3, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB94_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB94_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vmv1r.v v25, v0
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
@@ -4078,7 +3922,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    mv a0, a1
 ; CHECK-ZVBB-NEXT:  .LBB94_2:
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVBB-NEXT:    vmv1r.v v0, v25
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
index e69b4789a09afe..3972f528bc32e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
@@ -145,18 +145,18 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    addi a3, sp, 64
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a4, a0, a2
-; RV32-NEXT:    vl8r.v v16, (a4)
-; RV32-NEXT:    vl8r.v v24, (a0)
+; RV32-NEXT:    vl8r.v v8, (a4)
+; RV32-NEXT:    vl8r.v v16, (a0)
 ; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-NEXT:    vmseq.vi v8, v16, 0
-; RV32-NEXT:    vmseq.vi v0, v24, 0
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32-NEXT:    vs8r.v v24, (a3)
+; RV32-NEXT:    vmseq.vi v24, v8, 0
+; RV32-NEXT:    vmseq.vi v0, v16, 0
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-NEXT:    vs8r.v v16, (a3)
 ; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-NEXT:    vs8r.v v8, (a2)
 ; RV32-NEXT:    lbu a0, 0(a1)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -185,18 +185,18 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    addi a3, sp, 64
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a4, a0, a2
-; RV64-NEXT:    vl8r.v v16, (a4)
-; RV64-NEXT:    vl8r.v v24, (a0)
+; RV64-NEXT:    vl8r.v v8, (a4)
+; RV64-NEXT:    vl8r.v v16, (a0)
 ; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64-NEXT:    vs8r.v v24, (a3)
+; RV64-NEXT:    vmseq.vi v24, v8, 0
+; RV64-NEXT:    vmseq.vi v0, v16, 0
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-NEXT:    vs8r.v v16, (a3)
 ; RV64-NEXT:    add a2, a3, a2
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-NEXT:    vs8r.v v8, (a2)
 ; RV64-NEXT:    lbu a0, 0(a1)
 ; RV64-NEXT:    addi sp, s0, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
index 75747a6674b7b4..ff5e8fb539a782 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
@@ -8,9 +8,9 @@ define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -30,9 +30,9 @@ define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -52,9 +52,9 @@ define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -74,9 +74,9 @@ define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -96,9 +96,9 @@ define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -118,9 +118,9 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
@@ -250,9 +250,9 @@ define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -272,9 +272,9 @@ define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -294,9 +294,9 @@ define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -316,9 +316,9 @@ define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
index 31a94532044574..57e77708791662 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
@@ -8,9 +8,9 @@ define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -30,9 +30,9 @@ define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -52,9 +52,9 @@ define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -74,9 +74,9 @@ define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -96,9 +96,9 @@ define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -118,9 +118,9 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
@@ -250,9 +250,9 @@ define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -272,9 +272,9 @@ define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -294,9 +294,9 @@ define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -316,9 +316,9 @@ define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 943fc58d637a00..33e88699b3523b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1390,29 +1390,29 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 ; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v20, v12, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a5), zero
+; RV32-NEXT:    vlse64.v v20, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v20, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
-; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
@@ -1687,7 +1687,9 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -1697,36 +1699,38 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi a5, a5, 48
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1790,27 +1794,27 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -2051,7 +2055,9 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -2061,36 +2067,38 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi a5, a5, 48
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2154,27 +2162,27 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -2377,87 +2385,63 @@ declare <128 x i16> @llvm.vp.bitreverse.v128i16(<128 x i16>, <128 x i1>, i32)
 define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bitreverse_v128i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    li a2, 64
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
+; CHECK-NEXT:    vslidedown.vi v7, v0, 8
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 4, v0.t
 ; CHECK-NEXT:    lui a1, 1
 ; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a1, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 2, v0.t
 ; CHECK-NEXT:    lui a2, 3
 ; CHECK-NEXT:    addi a2, a2, 819
-; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; CHECK-NEXT:    lui a3, 5
 ; CHECK-NEXT:    addi a3, a3, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a3, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a4, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a4, a0, -64
 ; CHECK-NEXT:    sltu a0, a0, a4
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a3, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vsll.vi v16, v16, 1, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <128 x i16> @llvm.vp.bitreverse.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
   ret <128 x i16> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index f80d4e5c0d7c3f..ab212bcad5e2df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -610,29 +610,29 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v20, v12, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a5), zero
+; RV32-NEXT:    vlse64.v v20, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v20, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
-; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -781,7 +781,9 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -791,36 +793,38 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -857,27 +861,27 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -1012,7 +1016,9 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
@@ -1022,36 +1028,38 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1088,27 +1096,27 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    lui a3, 16
 ; RV64-NEXT:    addiw a3, a3, -256
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsll.vx v24, v24, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    addi a5, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -1217,51 +1225,27 @@ declare <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16>, <128 x i1>, i32)
 define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bswap_v128i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    li a2, 64
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
+; CHECK-NEXT:    vslidedown.vi v7, v0, 8
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a1, a0, -64
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    ret
   %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
   ret <128 x i16> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index af7d7f7ae755bc..886e874b9f9584 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -126,14 +126,14 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vsub.vv v10, v8, v9
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    lui a1, %hi(.LCPI4_1)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI4_1)
+; CHECK-NEXT:    vle32.v v12, (a1)
 ; CHECK-NEXT:    vslide1down.vx v11, v11, a0
-; CHECK-NEXT:    lui a0, %hi(.LCPI4_1)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_1)
-; CHECK-NEXT:    vle32.v v12, (a0)
 ; CHECK-NEXT:    vmulhu.vv v10, v10, v11
 ; CHECK-NEXT:    vadd.vv v9, v10, v9
-; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsrl.vv v9, v9, v12
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %e0 = udiv i32 %a, 23
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 5d024f140fd5d6..5348e7df168c41 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -194,23 +194,23 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <16 x half> @llvm.vp.ceil.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 3
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_ceil_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -431,21 +431,21 @@ declare <8 x float> @llvm.vp.ceil.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_ceil_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.ceil.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -475,21 +475,21 @@ declare <16 x float> @llvm.vp.ceil.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_ceil_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.ceil.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -561,21 +561,21 @@ declare <4 x double> @llvm.vp.ceil.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.ceil.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -605,21 +605,21 @@ declare <8 x double> @llvm.vp.ceil.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.ceil.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -649,21 +649,21 @@ declare <15 x double> @llvm.vp.ceil.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.ceil.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -693,21 +693,21 @@ declare <16 x double> @llvm.vp.ceil.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.ceil.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -737,21 +737,9 @@ declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -760,48 +748,36 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 3
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.ceil.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 2f4539d5038c2a..ac14ead7c951c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1986,17 +1986,18 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 44(sp)
@@ -2034,112 +2035,55 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -2148,97 +2092,63 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 48
@@ -2246,57 +2156,46 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ;
 ; RV64-LABEL: vp_ctlz_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 16, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a2, a2, 1365
 ; RV64-NEXT:    slli a3, a2, 32
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    addiw a3, a3, 819
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    slli a5, a4, 32
@@ -2309,50 +2208,37 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a7, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a7
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 16, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
@@ -4539,17 +4425,18 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 44(sp)
@@ -4587,112 +4474,55 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4701,97 +4531,63 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 48
@@ -4799,57 +4595,46 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB70_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 16, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a2, a2, 1365
 ; RV64-NEXT:    slli a3, a2, 32
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    addiw a3, a3, 819
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    slli a5, a4, 32
@@ -4862,50 +4647,37 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a7, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a7
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 16, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index 0b6d8b33394d50..ae34d056251a63 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1528,72 +1528,50 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a2, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a2, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
@@ -1610,25 +1588,25 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1655,43 +1633,32 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    lui a1, 349525
 ; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    addiw a2, a2, 819
 ; RV64-NEXT:    slli a3, a2, 32
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    addiw a3, a3, -241
 ; RV64-NEXT:    slli a4, a3, 32
@@ -1704,37 +1671,24 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    addi a6, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a6, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a6
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index f2926fa91e5c26..b1c25be7d3430e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1666,17 +1666,18 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 44(sp)
@@ -1692,124 +1693,67 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -1818,87 +1762,53 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v24, a2, v0.t
+; RV32-NEXT:    vnot.v v24, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1906,47 +1816,36 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ;
 ; RV64-LABEL: vp_cttz_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB34_2
-; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB34_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    li a1, 1
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsub.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    addiw a3, a3, 819
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    slli a5, a4, 32
@@ -1959,40 +1858,27 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a7, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a7
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsub.vx v24, v16, a2, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
@@ -3819,17 +3705,18 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 44(sp)
@@ -3845,124 +3732,67 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB70_2:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -3971,87 +3801,53 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v24, a2, v0.t
+; RV32-NEXT:    vnot.v v24, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 48
@@ -4059,47 +3855,36 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB70_2
-; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB70_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB70_2:
-; RV64-NEXT:    li a1, 1
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsub.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    addiw a3, a3, 819
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    slli a5, a4, 32
@@ -4112,40 +3897,27 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a7, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a7
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vsub.vx v24, v16, a2, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 8c8da6d1e00313..309d568322a8b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -45,9 +45,9 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVF-NEXT:    vnsrl.wi v9, v10, 0
 ; RVF-NEXT:    li a1, 127
+; RVF-NEXT:    vsub.vx v9, v9, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vsub.vx v8, v9, a1
-; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVF-NEXT:    vmerge.vim v8, v9, 8, v0
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -64,9 +64,9 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVD-NEXT:    vnsrl.wi v9, v10, 0
 ; RVD-NEXT:    li a1, 127
+; RVD-NEXT:    vsub.vx v9, v9, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v9, a1
-; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVD-NEXT:    vmerge.vim v8, v9, 8, v0
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -444,9 +444,9 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVF-NEXT:    vnsrl.wi v10, v12, 0
 ; RVF-NEXT:    li a1, 127
+; RVF-NEXT:    vsub.vx v10, v10, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vsub.vx v8, v10, a1
-; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVF-NEXT:    vmerge.vim v8, v10, 8, v0
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -464,9 +464,9 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVD-NEXT:    vnsrl.wi v10, v12, 0
 ; RVD-NEXT:    li a1, 127
+; RVD-NEXT:    vsub.vx v10, v10, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v10, a1
-; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVD-NEXT:    vmerge.vim v8, v10, 8, v0
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index dccb62877af3cc..232d453c1cfa4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -10,9 +10,9 @@ define i1 @extractelt_v1i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v9, zero
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -328,17 +328,17 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    li a2, 128
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT:    addi a2, a0, 128
-; RV32-NEXT:    vle8.v v16, (a2)
-; RV32-NEXT:    vle8.v v24, (a0)
+; RV32-NEXT:    vle8.v v8, (a2)
+; RV32-NEXT:    vle8.v v16, (a0)
 ; RV32-NEXT:    mv a0, sp
 ; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vmseq.vi v8, v16, 0
-; RV32-NEXT:    vmseq.vi v0, v24, 0
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32-NEXT:    vse8.v v24, (a0)
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32-NEXT:    vmseq.vi v24, v8, 0
+; RV32-NEXT:    vmseq.vi v0, v16, 0
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-NEXT:    vse8.v v16, (a0)
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    lbu a0, 0(a1)
@@ -359,17 +359,17 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    li a2, 128
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT:    addi a2, a0, 128
-; RV64-NEXT:    vle8.v v16, (a2)
-; RV64-NEXT:    vle8.v v24, (a0)
+; RV64-NEXT:    vle8.v v8, (a2)
+; RV64-NEXT:    vle8.v v16, (a0)
 ; RV64-NEXT:    mv a0, sp
 ; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64-NEXT:    vse8.v v24, (a0)
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64-NEXT:    vmseq.vi v24, v8, 0
+; RV64-NEXT:    vmseq.vi v0, v16, 0
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-NEXT:    vse8.v v16, (a0)
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    lbu a0, 0(a1)
@@ -390,17 +390,17 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32ZBS-NEXT:    li a2, 128
 ; RV32ZBS-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV32ZBS-NEXT:    addi a2, a0, 128
-; RV32ZBS-NEXT:    vle8.v v16, (a2)
-; RV32ZBS-NEXT:    vle8.v v24, (a0)
+; RV32ZBS-NEXT:    vle8.v v8, (a2)
+; RV32ZBS-NEXT:    vle8.v v16, (a0)
 ; RV32ZBS-NEXT:    mv a0, sp
 ; RV32ZBS-NEXT:    add a1, a0, a1
-; RV32ZBS-NEXT:    vmseq.vi v8, v16, 0
-; RV32ZBS-NEXT:    vmseq.vi v0, v24, 0
-; RV32ZBS-NEXT:    vmv.v.i v16, 0
-; RV32ZBS-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32ZBS-NEXT:    vse8.v v24, (a0)
-; RV32ZBS-NEXT:    vmv1r.v v0, v8
-; RV32ZBS-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32ZBS-NEXT:    vmseq.vi v24, v8, 0
+; RV32ZBS-NEXT:    vmseq.vi v0, v16, 0
+; RV32ZBS-NEXT:    vmv.v.i v8, 0
+; RV32ZBS-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32ZBS-NEXT:    vse8.v v16, (a0)
+; RV32ZBS-NEXT:    vmv1r.v v0, v24
+; RV32ZBS-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32ZBS-NEXT:    addi a0, sp, 128
 ; RV32ZBS-NEXT:    vse8.v v8, (a0)
 ; RV32ZBS-NEXT:    lbu a0, 0(a1)
@@ -421,17 +421,17 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64ZBS-NEXT:    li a2, 128
 ; RV64ZBS-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV64ZBS-NEXT:    addi a2, a0, 128
-; RV64ZBS-NEXT:    vle8.v v16, (a2)
-; RV64ZBS-NEXT:    vle8.v v24, (a0)
+; RV64ZBS-NEXT:    vle8.v v8, (a2)
+; RV64ZBS-NEXT:    vle8.v v16, (a0)
 ; RV64ZBS-NEXT:    mv a0, sp
 ; RV64ZBS-NEXT:    add a1, a0, a1
-; RV64ZBS-NEXT:    vmseq.vi v8, v16, 0
-; RV64ZBS-NEXT:    vmseq.vi v0, v24, 0
-; RV64ZBS-NEXT:    vmv.v.i v16, 0
-; RV64ZBS-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64ZBS-NEXT:    vse8.v v24, (a0)
-; RV64ZBS-NEXT:    vmv1r.v v0, v8
-; RV64ZBS-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64ZBS-NEXT:    vmseq.vi v24, v8, 0
+; RV64ZBS-NEXT:    vmseq.vi v0, v16, 0
+; RV64ZBS-NEXT:    vmv.v.i v8, 0
+; RV64ZBS-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64ZBS-NEXT:    vse8.v v16, (a0)
+; RV64ZBS-NEXT:    vmv1r.v v0, v24
+; RV64ZBS-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64ZBS-NEXT:    addi a0, sp, 128
 ; RV64ZBS-NEXT:    vse8.v v8, (a0)
 ; RV64ZBS-NEXT:    lbu a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
index 1e93a73ede5d65..ea6cdab0bd3206 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
@@ -8,9 +8,9 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -30,9 +30,9 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -52,9 +52,9 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -74,9 +74,9 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -96,9 +96,9 @@ define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -119,9 +119,9 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -251,9 +251,9 @@ define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -273,9 +273,9 @@ define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -295,9 +295,9 @@ define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -317,9 +317,9 @@ define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
index 53018939fc6eb4..4eca68e8e007ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
@@ -8,9 +8,9 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -30,9 +30,9 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -52,9 +52,9 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -74,9 +74,9 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -96,9 +96,9 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -119,9 +119,9 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -251,9 +251,9 @@ define <1 x double> @floor_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -273,9 +273,9 @@ define <2 x double> @floor_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -295,9 +295,9 @@ define <4 x double> @floor_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -317,9 +317,9 @@ define <8 x double> @floor_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 6c2be509f7c22e..6915722ace2d2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -194,23 +194,23 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_floor_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <16 x half> @llvm.vp.floor.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 2
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_floor_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -431,21 +431,21 @@ declare <8 x float> @llvm.vp.floor.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_floor_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.floor.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -475,21 +475,21 @@ declare <16 x float> @llvm.vp.floor.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_floor_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.floor.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -561,21 +561,21 @@ declare <4 x double> @llvm.vp.floor.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.floor.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -605,21 +605,21 @@ declare <8 x double> @llvm.vp.floor.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.floor.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -649,21 +649,21 @@ declare <15 x double> @llvm.vp.floor.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.floor.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -693,21 +693,21 @@ declare <16 x double> @llvm.vp.floor.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.floor.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -737,21 +737,9 @@ declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -760,48 +748,36 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 2
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.floor.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 02c2fafc89785f..ffc09b866284a3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -15,11 +15,10 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_v2f16_vv:
@@ -29,11 +28,10 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -48,11 +46,10 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_v4f16_vv:
@@ -62,11 +59,10 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -81,11 +77,10 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_v8f16_vv:
@@ -95,11 +90,10 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
+; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -114,11 +108,10 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v12
+; ZVFH-NEXT:    vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v14
+; ZVFH-NEXT:    vfmax.vv v8, v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_v16f16_vv:
@@ -128,11 +121,10 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
+; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -147,11 +139,10 @@ define <2 x float> @vfmax_v2f32_vv(<2 x float> %a, <2 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v11
+; CHECK-NEXT:    vfmax.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   ret <2 x float> %v
@@ -164,11 +155,10 @@ define <4 x float> @vfmax_v4f32_vv(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v11
+; CHECK-NEXT:    vfmax.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
   ret <4 x float> %v
@@ -181,11 +171,10 @@ define <8 x float> @vfmax_v8f32_vv(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v14
+; CHECK-NEXT:    vfmax.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.maximum.v8f32(<8 x float> %a, <8 x float> %b)
   ret <8 x float> %v
@@ -198,11 +187,10 @@ define <16 x float> @vfmax_v16f32_vv(<16 x float> %a, <16 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v20
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %a, <16 x float> %b)
   ret <16 x float> %v
@@ -215,11 +203,10 @@ define <2 x double> @vfmax_v2f64_vv(<2 x double> %a, <2 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v11
+; CHECK-NEXT:    vfmax.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
   ret <2 x double> %v
@@ -232,11 +219,10 @@ define <4 x double> @vfmax_v4f64_vv(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v14
+; CHECK-NEXT:    vfmax.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %v
@@ -249,11 +235,10 @@ define <8 x double> @vfmax_v8f64_vv(<8 x double> %a, <8 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v20
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.maximum.v8f64(<8 x double> %a, <8 x double> %b)
   ret <8 x double> %v
@@ -266,9 +251,8 @@ define <16 x double> @vfmax_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    ret
@@ -302,13 +286,12 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmax_v2f16_vv_nnana:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfadd.vv v10, v8, v8
+; ZVFH-NEXT:    vfadd.vv v8, v8, v8
 ; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFH-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v8
-; ZVFH-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFH-NEXT:    vfmax.vv v8, v11, v8
+; ZVFH-NEXT:    vmerge.vvm v10, v9, v8, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmerge.vvm v8, v8, v9, v0
+; ZVFH-NEXT:    vfmax.vv v8, v10, v8
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnana:
@@ -319,16 +302,15 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -343,13 +325,12 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmax_v2f16_vv_nnanb:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfadd.vv v10, v9, v9
+; ZVFH-NEXT:    vfadd.vv v9, v9, v9
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v9, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v9
-; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnanb:
@@ -364,12 +345,11 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index b15d697f0754ed..6f34e487ee05de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -15,11 +15,10 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_v2f16_vv:
@@ -29,11 +28,10 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -48,11 +46,10 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_v4f16_vv:
@@ -62,11 +59,10 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -81,11 +77,10 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_v8f16_vv:
@@ -95,11 +90,10 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v14
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
+; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -114,11 +108,10 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v12
+; ZVFH-NEXT:    vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v14
+; ZVFH-NEXT:    vfmin.vv v8, v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_v16f16_vv:
@@ -128,11 +121,10 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v20
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
+; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -147,11 +139,10 @@ define <2 x float> @vfmin_v2f32_vv(<2 x float> %a, <2 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v11
+; CHECK-NEXT:    vfmin.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   ret <2 x float> %v
@@ -164,11 +155,10 @@ define <4 x float> @vfmin_v4f32_vv(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v11
+; CHECK-NEXT:    vfmin.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
   ret <4 x float> %v
@@ -181,11 +171,10 @@ define <8 x float> @vfmin_v8f32_vv(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v14
+; CHECK-NEXT:    vfmin.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.minimum.v8f32(<8 x float> %a, <8 x float> %b)
   ret <8 x float> %v
@@ -198,11 +187,10 @@ define <16 x float> @vfmin_v16f32_vv(<16 x float> %a, <16 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v20
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.minimum.v16f32(<16 x float> %a, <16 x float> %b)
   ret <16 x float> %v
@@ -215,11 +203,10 @@ define <2 x double> @vfmin_v2f64_vv(<2 x double> %a, <2 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v11
+; CHECK-NEXT:    vfmin.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
   ret <2 x double> %v
@@ -232,11 +219,10 @@ define <4 x double> @vfmin_v4f64_vv(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v14
+; CHECK-NEXT:    vfmin.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %v
@@ -249,11 +235,10 @@ define <8 x double> @vfmin_v8f64_vv(<8 x double> %a, <8 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v20
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.minimum.v8f64(<8 x double> %a, <8 x double> %b)
   ret <8 x double> %v
@@ -266,9 +251,8 @@ define <16 x double> @vfmin_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    ret
@@ -302,13 +286,12 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmin_v2f16_vv_nnana:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfadd.vv v10, v8, v8
+; ZVFH-NEXT:    vfadd.vv v8, v8, v8
 ; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFH-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v8
-; ZVFH-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFH-NEXT:    vfmin.vv v8, v11, v8
+; ZVFH-NEXT:    vmerge.vvm v10, v9, v8, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmerge.vvm v8, v8, v9, v0
+; ZVFH-NEXT:    vfmin.vv v8, v10, v8
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnana:
@@ -319,16 +302,15 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -343,13 +325,12 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmin_v2f16_vv_nnanb:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfadd.vv v10, v9, v9
+; ZVFH-NEXT:    vfadd.vv v9, v9, v9
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v9, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v9
-; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnanb:
@@ -364,12 +345,11 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
index 1b50214bbf164d..d0d1bf4241c76a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
@@ -10,9 +10,9 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -33,9 +33,9 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -56,9 +56,9 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -79,9 +79,9 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -103,9 +103,9 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -218,9 +218,9 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -241,9 +241,9 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -264,9 +264,9 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 8e214e40547832..9e83efd3519539 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1407,8 +1407,8 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float
 ; CHECK-NEXT:    vfmv.v.f v8, fa4
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x float> poison, float %e0, i64 0
@@ -1458,8 +1458,8 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
 ; CHECK-NEXT:    vfmv.v.f v8, fa4
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x double> poison, double %e0, i64 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 6bfd0ac932672f..e201d5d2ea3e73 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -57,8 +57,8 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV32-V512-NEXT:    vid.v v10
 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
-; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
+; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
 ; RV32-V512-NEXT:    vmv.v.v v8, v10
 ; RV32-V512-NEXT:    ret
@@ -68,8 +68,8 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; RV64-V512-NEXT:    vid.v v10
 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
+; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-V512-NEXT:    vmv.v.v v8, v10
 ; RV64-V512-NEXT:    ret
@@ -244,16 +244,17 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v0, v16
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v24, v16
 ; V128-NEXT:    vmv8r.v v16, v8
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v8, v0, 16
+; V128-NEXT:    vmv8r.v v8, v24
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vslidedown.vi v0, v24, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    vwaddu.vv v24, v8, v0
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    vwmaccu.vx v24, a0, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; V128-NEXT:    vslidedown.vi v0, v16, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index 85b849045e8cee..a8e4af2d7368e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -395,8 +395,8 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32-NEXT:    fcvt.w.d a2, fa5, rtz
 ; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslidedown.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vse8.v v9, (a1)
 ; RV32-NEXT:    addi sp, s0, -128
@@ -496,8 +496,8 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64-NEXT:    fcvt.l.d a2, fa5, rtz
 ; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslide1down.vx v9, v9, a0
+; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslidedown.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vse8.v v9, (a1)
 ; RV64-NEXT:    addi sp, s0, -128
@@ -580,8 +580,8 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    fmax.d fa4, fa4, fa3
 ; RV32-NEXT:    fmin.d fa5, fa4, fa5
 ; RV32-NEXT:    fcvt.wu.d a0, fa5, rtz
-; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslidedown.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vse8.v v9, (a1)
 ; RV32-NEXT:    addi sp, s0, -128
@@ -656,8 +656,8 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT:    fmax.d fa4, fa4, fa3
 ; RV64-NEXT:    fmin.d fa5, fa4, fa5
 ; RV64-NEXT:    fcvt.lu.d a0, fa5, rtz
-; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslide1down.vx v9, v9, a0
+; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslidedown.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vse8.v v9, (a1)
 ; RV64-NEXT:    addi sp, s0, -128
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll
index f189354237ee3a..cb7ff2241f5aa9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll
@@ -10,9 +10,9 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: round_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -32,9 +32,9 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: round_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -54,9 +54,9 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: round_v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -76,9 +76,9 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: round_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -98,9 +98,9 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: round_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -121,9 +121,9 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -253,9 +253,9 @@ define <1 x double> @round_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: round_v1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -275,9 +275,9 @@ define <2 x double> @round_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: round_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -297,9 +297,9 @@ define <4 x double> @round_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: round_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -319,9 +319,9 @@ define <8 x double> @round_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: round_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
index 11920c7c31c981..f5549adb740cc8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
@@ -10,9 +10,9 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -32,9 +32,9 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -54,9 +54,9 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -76,9 +76,9 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -98,9 +98,9 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -121,9 +121,9 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -253,9 +253,9 @@ define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -275,9 +275,9 @@ define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -297,9 +297,9 @@ define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -319,9 +319,9 @@ define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
index a2ff77625b7584..d2cf4c11ac1a57 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
@@ -680,14 +680,15 @@ define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v8, v24, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -705,27 +706,38 @@ define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v16, v24, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsll.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
index f16581444afca5..2481dd4054c256 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
@@ -8,9 +8,9 @@ define <1 x half> @trunc_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -28,9 +28,9 @@ define <2 x half> @trunc_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -48,9 +48,9 @@ define <4 x half> @trunc_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -68,9 +68,9 @@ define <8 x half> @trunc_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -88,9 +88,9 @@ define <16 x half> @trunc_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -109,9 +109,9 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -229,9 +229,9 @@ define <1 x double> @trunc_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -249,9 +249,9 @@ define <2 x double> @trunc_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -269,9 +269,9 @@ define <4 x double> @trunc_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -289,9 +289,9 @@ define <8 x double> @trunc_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 9f0240c53b219a..32e431838dffe9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -391,18 +391,18 @@ define void @insert_v8i1_v4i1_0(ptr %vp, ptr %svp) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a1)
+; CHECK-NEXT:    vlm.v v10, (a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
+; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v9, 0
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vsm.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <8 x i1>, ptr %vp
@@ -418,17 +418,17 @@ define void @insert_v8i1_v4i1_4(ptr %vp, ptr %svp) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a1)
+; CHECK-NEXT:    vlm.v v10, (a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 4
-; CHECK-NEXT:    vmsne.vi v8, v9, 0
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vsm.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <8 x i1>, ptr %vp
@@ -468,32 +468,32 @@ define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, ptr %svp) {
 ; VLA-LABEL: insert_nxv2i1_v4i1_0:
 ; VLA:       # %bb.0:
 ; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vlm.v v10, (a0)
 ; VLA-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vmv1r.v v0, v10
 ; VLA-NEXT:    vmerge.vim v9, v9, 1, v0
-; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; VLA-NEXT:    vmv.v.i v10, 0
-; VLA-NEXT:    vmv1r.v v0, v8
-; VLA-NEXT:    vmerge.vim v8, v10, 1, v0
 ; VLA-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
-; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vmv.v.v v8, v9
 ; VLA-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; VLA-NEXT:    vmsne.vi v0, v9, 0
+; VLA-NEXT:    vmsne.vi v0, v8, 0
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: insert_nxv2i1_v4i1_0:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; VLS-NEXT:    vlm.v v8, (a0)
-; VLS-NEXT:    vmv.v.i v9, 0
-; VLS-NEXT:    vmerge.vim v10, v9, 1, v0
-; VLS-NEXT:    vmv1r.v v0, v8
-; VLS-NEXT:    vmerge.vim v8, v9, 1, v0
+; VLS-NEXT:    vlm.v v10, (a0)
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v9, v8, 1, v0
+; VLS-NEXT:    vmv1r.v v0, v10
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLS-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
-; VLS-NEXT:    vmv.v.v v10, v8
+; VLS-NEXT:    vmv.v.v v9, v8
 ; VLS-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; VLS-NEXT:    vmsne.vi v0, v10, 0
+; VLS-NEXT:    vmsne.vi v0, v9, 0
 ; VLS-NEXT:    ret
   %sv = load <4 x i1>, ptr %svp
   %c = call <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 6da83644413bc2..1a905e5476bba5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -70,8 +70,8 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV32-V512-NEXT:    vid.v v10
 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
-; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
+; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
 ; RV32-V512-NEXT:    vmv.v.v v8, v10
 ; RV32-V512-NEXT:    ret
@@ -81,8 +81,8 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; RV64-V512-NEXT:    vid.v v10
 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
+; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-V512-NEXT:    vmv.v.v v8, v10
 ; RV64-V512-NEXT:    ret
@@ -195,8 +195,8 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; V128-NEXT:    vid.v v8
 ; V128-NEXT:    vsrl.vi v8, v8, 1
-; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vadd.vi v8, v8, 1
+; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
@@ -210,8 +210,8 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
 ; V512-NEXT:    vid.v v8
 ; V512-NEXT:    vsrl.vi v8, v8, 1
-; V512-NEXT:    vmv.v.i v0, 10
 ; V512-NEXT:    vadd.vi v8, v8, 1
+; V512-NEXT:    vmv.v.i v0, 10
 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V512-NEXT:    vmv1r.v v8, v10
 ; V512-NEXT:    ret
@@ -409,16 +409,17 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v0, v16
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v24, v16
 ; V128-NEXT:    vmv8r.v v16, v8
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v8, v0, 16
+; V128-NEXT:    vmv8r.v v8, v24
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vslidedown.vi v0, v24, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    vwaddu.vv v24, v8, v0
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    vwmaccu.vx v24, a0, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; V128-NEXT:    vslidedown.vi v0, v16, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 0e8d9cf0306690..33922193492e2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -89,8 +89,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI6_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -162,16 +162,16 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 2
-; RV32-NEXT:    li a0, 5
-; RV32-NEXT:    vslide1down.vx v20, v16, a0
 ; RV32-NEXT:    lui a0, %hi(.LCPI11_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI11_0)
-; RV32-NEXT:    vle16.v v21, (a0)
+; RV32-NEXT:    vle16.v v20, (a0)
+; RV32-NEXT:    li a0, 5
+; RV32-NEXT:    vslide1down.vx v21, v16, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v20
 ; RV32-NEXT:    li a0, 164
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v21
-; RV32-NEXT:    vrgatherei16.vv v16, v12, v20, v0.t
+; RV32-NEXT:    vrgatherei16.vv v16, v12, v21, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    ret
 ;
@@ -213,9 +213,9 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_1)
 ; RV32-NEXT:    vle16.v v17, (a0)
+; RV32-NEXT:    vrgatherei16.vv v12, v20, v16
 ; RV32-NEXT:    li a0, 113
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vrgatherei16.vv v12, v20, v16
 ; RV32-NEXT:    vrgatherei16.vv v12, v8, v17, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
@@ -368,9 +368,9 @@ define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 66
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -387,9 +387,9 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
 ; CHECK-NEXT:    vmv.s.x v11, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 66
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -422,9 +422,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v11, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 66
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -441,9 +441,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 70
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -464,9 +464,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    li a0, 98
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -667,9 +667,9 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    li a0, 195
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
@@ -681,9 +681,9 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
 ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
@@ -698,9 +698,9 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 03e99baf91c08e..8a527a4f24e757 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -3315,40 +3315,40 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV32-LABEL: mulhu_v16i16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v10, (a0)
+; RV32-NEXT:    vle16.v v8, (a0)
 ; RV32-NEXT:    li a1, 257
 ; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    lui a1, 1048568
-; RV32-NEXT:    vmerge.vxm v12, v8, a1, v0
+; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; RV32-NEXT:    lui a1, 4
 ; RV32-NEXT:    addi a1, a1, 64
-; RV32-NEXT:    vmv.s.x v8, a1
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 1, v0
+; RV32-NEXT:    vmv1r.v v18, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-NEXT:    lui a1, %hi(.LCPI182_0)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI182_0)
 ; RV32-NEXT:    vle16.v v14, (a1)
-; RV32-NEXT:    vsext.vf2 v16, v9
-; RV32-NEXT:    vsrl.vv v16, v10, v16
-; RV32-NEXT:    vmulhu.vv v14, v16, v14
-; RV32-NEXT:    vsub.vv v10, v10, v14
-; RV32-NEXT:    vmulhu.vv v10, v10, v12
-; RV32-NEXT:    vadd.vv v10, v10, v14
+; RV32-NEXT:    vsext.vf2 v16, v12
+; RV32-NEXT:    vsrl.vv v12, v8, v16
+; RV32-NEXT:    vmulhu.vv v12, v12, v14
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vmulhu.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 2
 ; RV32-NEXT:    addi a1, a1, 289
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 3
-; RV32-NEXT:    vmerge.vim v9, v9, 2, v0
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v9, 1, v0
+; RV32-NEXT:    vmv.v.i v10, 3
+; RV32-NEXT:    vmerge.vim v10, v10, 2, v0
+; RV32-NEXT:    vmv1r.v v0, v18
+; RV32-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vsext.vf2 v12, v8
-; RV32-NEXT:    vsrl.vv v8, v10, v12
+; RV32-NEXT:    vsext.vf2 v12, v10
+; RV32-NEXT:    vsrl.vv v8, v8, v12
 ; RV32-NEXT:    vse16.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index f98cb343a2ab42..2781a153b2b409 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -176,27 +176,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vslideup.vi v8, v16, 4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    li a5, 12
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 12
-; RV32-NEXT:    vmv.s.x v3, a4
+; RV32-NEXT:    vmv.s.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vmv1r.v v0, v3
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs1r.v v3, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vslideup.vi v8, v16, 10, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    li a5, 20
@@ -204,12 +199,18 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv1r.v v3, v0
 ; RV32-NEXT:    lui a4, %hi(.LCPI6_0)
 ; RV32-NEXT:    addi a4, a4, %lo(.LCPI6_0)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vle16.v v8, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    slli a4, a4, 2
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
@@ -221,14 +222,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v24, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -236,13 +237,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a5, -64
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
@@ -271,60 +271,58 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v8, v16, 2
-; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v8, v16, 8, v0.t
-; RV32-NEXT:    vmv.v.v v20, v8
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    vslideup.vi v8, v24, 8, v0.t
+; RV32-NEXT:    vmv.v.v v16, v8
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_2)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_2)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_3)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_3)
-; RV32-NEXT:    vle16.v v24, (a1)
+; RV32-NEXT:    vle16.v v20, (a1)
 ; RV32-NEXT:    vle16.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v24
+; RV32-NEXT:    vrgatherei16.vv v8, v0, v20
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
+; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v20, v8
+; RV32-NEXT:    vmv.v.v v16, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_4)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_4)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
@@ -333,57 +331,53 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v12, v24, v8
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v12, v16, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v3, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v12, v16, 6, v0.t
-; RV32-NEXT:    vmv.v.v v4, v12
+; RV32-NEXT:    vmv.v.v v16, v12
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_5)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_5)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_6)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_6)
-; RV32-NEXT:    vle16.v v24, (a1)
-; RV32-NEXT:    vle16.v v8, (a3)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v20, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
 ; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v2, a1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v24
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v8
+; RV32-NEXT:    vmv.v.v v16, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_7)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
@@ -392,177 +386,189 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v28, v24, v8
-; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v12, v16, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v28, v8, 4, v0.t
-; RV32-NEXT:    vmv.v.v v4, v28
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v12, v16, 4, v0.t
+; RV32-NEXT:    vmv.v.v v4, v12
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_8)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_8)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_9)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_9)
-; RV32-NEXT:    vle16.v v28, (a1)
-; RV32-NEXT:    vle16.v v24, (a3)
+; RV32-NEXT:    vle16.v v0, (a1)
+; RV32-NEXT:    vle16.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v28
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v4, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_10)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_10)
-; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    lui a1, 15
-; RV32-NEXT:    vmv.s.x v6, a1
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v28, v24, 6
-; RV32-NEXT:    vmv1r.v v0, v6
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v24, v16, 6
+; RV32-NEXT:    lui a1, 15
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v28, v8, v4, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; RV32-NEXT:    vmv1r.v v3, v0
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_11)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_11)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_12)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_12)
-; RV32-NEXT:    vle16.v v0, (a1)
-; RV32-NEXT:    vle16.v v24, (a3)
+; RV32-NEXT:    vle16.v v4, (a1)
+; RV32-NEXT:    vle16.v v28, (a3)
 ; RV32-NEXT:    li a1, 1008
-; RV32-NEXT:    vmv.s.x v7, a1
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v7, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v0
-; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v28, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v28, v8
+; RV32-NEXT:    vmv.v.v v24, v16
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_13)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_13)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    vmv1r.v v0, v6
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a3, 12
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v12, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_14)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_14)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a2, %hi(.LCPI6_15)
 ; RV32-NEXT:    addi a2, a2, %lo(.LCPI6_15)
-; RV32-NEXT:    vle16.v v16, (a1)
-; RV32-NEXT:    vle16.v v8, (a2)
+; RV32-NEXT:    vle16.v v12, (a1)
+; RV32-NEXT:    vle16.v v16, (a2)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v16
+; RV32-NEXT:    vrgatherei16.vv v16, v0, v12
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v24, v8
+; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi a1, a0, 320
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v24, (a1)
+; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    vse32.v v28, (a1)
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    addi a1, a0, 192
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    slli a2, a2, 2
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
@@ -586,236 +592,289 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 52
+; RV64-NEXT:    li a3, 54
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x34, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 52 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x36, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 54 * vlenb
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    addi a2, a1, 256
-; RV64-NEXT:    vle64.v v16, (a2)
+; RV64-NEXT:    vle64.v v24, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 27
+; RV64-NEXT:    li a3, 30
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a2, a1, 128
 ; RV64-NEXT:    vle64.v v8, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
+; RV64-NEXT:    li a3, 46
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 43
+; RV64-NEXT:    li a2, 38
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgather.vi v8, v16, 4
+; RV64-NEXT:    vrgather.vi v8, v24, 4
 ; RV64-NEXT:    li a1, 128
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v16, 8
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vmv1r.v v28, v0
+; RV64-NEXT:    vmv.s.x v12, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vrgather.vi v8, v24, 2, v0.t
+; RV64-NEXT:    vs1r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v24, 8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 19
+; RV64-NEXT:    li a2, 21
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.v.v v4, v8
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v8, v24, 2, v0.t
+; RV64-NEXT:    vmv.v.v v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    li a1, 6
 ; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    vmul.vx v2, v8, a1
+; RV64-NEXT:    vmul.vx v28, v8, a1
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v28
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    vmv.s.x v16, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 43
+; RV64-NEXT:    li a2, 13
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v2
-; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vmv.s.x v1, a1
-; RV64-NEXT:    vadd.vi v30, v2, -16
+; RV64-NEXT:    vs1r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vadd.vi v30, v28, -16
+; RV64-NEXT:    vmv.v.v v6, v28
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 46
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vrgatherei16.vv v8, v16, v30, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v4, v8
+; RV64-NEXT:    vmv.v.v v24, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 27
+; RV64-NEXT:    li a2, 30
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v4, v16, 5
-; RV64-NEXT:    vmv1r.v v0, v28
-; RV64-NEXT:    vrgather.vi v4, v24, 3, v0.t
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs2r.v v2, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vadd.vi v16, v2, 1
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v8, v24, 5
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 43
+; RV64-NEXT:    li a2, 21
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v16
-; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v16, v2, -15
+; RV64-NEXT:    vrgather.vi v8, v24, 3, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-NEXT:    vadd.vi v8, v6, 1
+; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 11
+; RV64-NEXT:    li a2, 38
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs2r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v8
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    vadd.vi v18, v6, -15
+; RV64-NEXT:    vmv2r.v v16, v6
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 46
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 11
+; RV64-NEXT:    li a2, 13
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl2r.v v2, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v2, v0.t
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v18, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v4, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 11
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl2r.v v2, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v6, v2, 2
+; RV64-NEXT:    vmv2r.v v30, v6
+; RV64-NEXT:    vadd.vi v20, v6, 2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v6
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 38
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v0, v20
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    vmv.s.x v7, a1
-; RV64-NEXT:    vadd.vi v26, v2, -14
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 2
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vadd.vi v20, v30, -14
+; RV64-NEXT:    vmv2r.v v22, v30
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v26, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 46
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v12, 6
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 27
+; RV64-NEXT:    li a2, 30
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vrgatherei16.vv v16, v24, v12
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v6, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv1r.v v0, v6
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 19
+; RV64-NEXT:    li a2, 29
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vrgather.vi v16, v24, 4, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
 ; RV64-NEXT:    vmv.v.v v16, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v28, v2, 3
+; RV64-NEXT:    vadd.vi v20, v22, 3
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 43
+; RV64-NEXT:    li a2, 38
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v28
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v0, v20
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v28, v2, -13
+; RV64-NEXT:    vadd.vi v20, v22, -13
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 46
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 2
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV64-NEXT:    lui a1, 16
 ; RV64-NEXT:    addi a1, a1, 7
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 27
+; RV64-NEXT:    li a2, 30
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v12
-; RV64-NEXT:    vmv1r.v v0, v6
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv4r.v v16, v0
+; RV64-NEXT:    vrgatherei16.vv v4, v0, v12
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 19
+; RV64-NEXT:    li a2, 21
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v16, v24, 5, v0.t
+; RV64-NEXT:    vrgather.vi v4, v24, 5, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v16, v8
+; RV64-NEXT:    vmv.v.v v4, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 1
+; RV64-NEXT:    slli a2, a1, 2
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 96
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a1
@@ -823,98 +882,115 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    li a1, 192
 ; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 27
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v4, v16, 2
-; RV64-NEXT:    vrgatherei16.vv v4, v24, v8, v0.t
+; RV64-NEXT:    vrgather.vi v12, v16, 2
+; RV64-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
+; RV64-NEXT:    vmv.v.v v16, v12
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v16, v2, 4
+; RV64-NEXT:    vadd.vi v20, v22, 4
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 43
+; RV64-NEXT:    li a2, 38
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v16
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v0, v20
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    li a1, 28
-; RV64-NEXT:    vmv.s.x v1, a1
-; RV64-NEXT:    vadd.vi v16, v2, -12
+; RV64-NEXT:    vmv.s.x v20, a1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs1r.v v20, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vadd.vi v20, v22, -12
+; RV64-NEXT:    vmv2r.v v6, v22
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 46
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v4, v8
+; RV64-NEXT:    vmv.v.v v16, v8
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 112
 ; RV64-NEXT:    addi a1, a1, 1
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a1
+; RV64-NEXT:    vmv.v.x v8, a1
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 27
+; RV64-NEXT:    li a2, 30
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v8, v16, 3
+; RV64-NEXT:    vrgather.vi v12, v16, 3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 19
+; RV64-NEXT:    li a2, 21
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v12, v0.t
+; RV64-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, 5
+; RV64-NEXT:    vmv2r.v v10, v6
+; RV64-NEXT:    vadd.vi v8, v6, 5
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 43
+; RV64-NEXT:    li a2, 38
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v12
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v16, v0, v8
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, -11
+; RV64-NEXT:    vadd.vi v8, v10, -11
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 46
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v8, v16
+; RV64-NEXT:    vmv.v.v v12, v16
 ; RV64-NEXT:    addi a1, a0, 320
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    vse64.v v4, (a1)
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 192
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 1
+; RV64-NEXT:    slli a3, a2, 2
 ; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -922,29 +998,29 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 3
-; RV64-NEXT:    sub a2, a3, a2
+; RV64-NEXT:    li a3, 13
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 11
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 52
+; RV64-NEXT:    li a1, 54
 ; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index c295fed2c28c10..8986c876904b6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -141,8 +141,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: buildvec_mask_nonconst_v4i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -151,8 +151,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) {
 ; ZVE32F-LABEL: buildvec_mask_nonconst_v4i1:
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.v.i v0, 3
 ; ZVE32F-NEXT:    vmv.v.x v8, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 3
 ; ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -246,9 +246,9 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: buildvec_mask_nonconst_v8i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    li a2, 19
-; CHECK-NEXT:    vmv.s.x v0, a2
 ; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    li a1, 19
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -257,9 +257,9 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) {
 ; ZVE32F-LABEL: buildvec_mask_nonconst_v8i1:
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; ZVE32F-NEXT:    li a2, 19
-; ZVE32F-NEXT:    vmv.s.x v0, a2
 ; ZVE32F-NEXT:    vmv.v.x v8, a1
+; ZVE32F-NEXT:    li a1, 19
+; ZVE32F-NEXT:    vmv.s.x v0, a1
 ; ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -286,8 +286,8 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -303,8 +303,8 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
-; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -331,8 +331,8 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -348,8 +348,8 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
-; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -375,8 +375,8 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -391,8 +391,8 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll
index 4f7b885d998e5b..7fc442c88d101b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll
@@ -25,10 +25,10 @@ define void @splat_v1i1(ptr %x, i1 %y) {
 ; CHECK-LABEL: splat_v1i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    andi a1, a1, 1
-; CHECK-NEXT:    vmv.s.x v8, a1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 9fbc22221f99bd..be9b5a75a5f496 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -12631,8 +12631,8 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, -512
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vmerge.vim v8, v8, 0, v0
 ; RV32-NEXT:    vluxei32.v v8, (a0), v8
 ; RV32-NEXT:    ret
@@ -12728,8 +12728,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
-; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -12803,8 +12803,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV64V-NEXT:    vmv.v.x v8, a3
 ; RV64V-NEXT:    vslide1down.vx v8, v8, a5
 ; RV64V-NEXT:    vslide1down.vx v8, v8, a6
-; RV64V-NEXT:    vmv.v.i v0, 15
 ; RV64V-NEXT:    vslide1down.vx v8, v8, a7
+; RV64V-NEXT:    vmv.v.i v0, 15
 ; RV64V-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64V-NEXT:    addi sp, s0, -128
 ; RV64V-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
@@ -12854,8 +12854,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -12896,8 +12896,8 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
@@ -12941,8 +12941,8 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 2, i64 3, i64 6, i64 7, i64 10, i64 11, i64 14, i64 15>
@@ -12986,8 +12986,8 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 12, i64 13, i64 10, i64 11, i64 8, i64 9>
@@ -13031,8 +13031,8 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 10, i64 11, i64 6, i64 7, i64 2, i64 3>
@@ -13074,8 +13074,8 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 2, i32 3>
@@ -13120,8 +13120,8 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
@@ -13167,8 +13167,8 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 1, i32 2, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
@@ -13217,8 +13217,8 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -13264,8 +13264,8 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -13320,8 +13320,8 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a3
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 4, i64 5, i64 6, i64 7, i64 0, i64 1, i64 2, i64 3>
@@ -13367,8 +13367,8 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 0, i64 2, i64 3, i64 1, i64 4, i64 5, i64 6, i64 7>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index 79b05334cb1f92..c6665c4041b987 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -365,15 +365,15 @@ define void @masked_load_v32f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi a3, a1, 128
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v16, (a1)
-; RV32-NEXT:    vle64.v v24, (a3)
+; RV32-NEXT:    vle64.v v8, (a1)
+; RV32-NEXT:    vle64.v v16, (a3)
 ; RV32-NEXT:    fcvt.d.w fa5, zero
-; RV32-NEXT:    vmfeq.vf v8, v16, fa5
-; RV32-NEXT:    vmfeq.vf v0, v24, fa5
-; RV32-NEXT:    addi a1, a0, 128
-; RV32-NEXT:    vle64.v v16, (a1), v0.t
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vmfeq.vf v0, v8, fa5
+; RV32-NEXT:    vmfeq.vf v24, v16, fa5
 ; RV32-NEXT:    vle64.v v8, (a0), v0.t
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vle64.v v16, (a0), v0.t
 ; RV32-NEXT:    vse64.v v8, (a2)
 ; RV32-NEXT:    addi a0, a2, 128
 ; RV32-NEXT:    vse64.v v16, (a0)
@@ -383,15 +383,15 @@ define void @masked_load_v32f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi a3, a1, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v16, (a1)
-; RV64-NEXT:    vle64.v v24, (a3)
+; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    vle64.v v16, (a3)
 ; RV64-NEXT:    fmv.d.x fa5, zero
-; RV64-NEXT:    vmfeq.vf v8, v16, fa5
-; RV64-NEXT:    vmfeq.vf v0, v24, fa5
-; RV64-NEXT:    addi a1, a0, 128
-; RV64-NEXT:    vle64.v v16, (a1), v0.t
-; RV64-NEXT:    vmv1r.v v0, v8
+; RV64-NEXT:    vmfeq.vf v0, v8, fa5
+; RV64-NEXT:    vmfeq.vf v24, v16, fa5
 ; RV64-NEXT:    vle64.v v8, (a0), v0.t
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vle64.v v16, (a0), v0.t
 ; RV64-NEXT:    vse64.v v8, (a2)
 ; RV64-NEXT:    addi a0, a2, 128
 ; RV64-NEXT:    vse64.v v16, (a0)
@@ -429,15 +429,15 @@ define void @masked_load_v64f32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; CHECK-NEXT:    addi a3, a1, 128
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    vle32.v v24, (a3)
+; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a3)
 ; CHECK-NEXT:    fmv.w.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
-; CHECK-NEXT:    vmfeq.vf v0, v24, fa5
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle32.v v16, (a1), v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vmfeq.vf v24, v16, fa5
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    addi a0, a2, 128
 ; CHECK-NEXT:    vse32.v v16, (a0)
@@ -456,15 +456,15 @@ define void @masked_load_v128f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; CHECK-NEXT:    addi a3, a1, 128
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; CHECK-NEXT:    vle16.v v16, (a1)
-; CHECK-NEXT:    vle16.v v24, (a3)
+; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vle16.v v16, (a3)
 ; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
-; CHECK-NEXT:    vmfeq.vf v0, v24, fa5
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle16.v v16, (a1), v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vmfeq.vf v24, v16, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    addi a0, a2, 128
 ; CHECK-NEXT:    vse16.v v16, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index 4bbda2152a6f97..b6568fb46b8387 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -401,18 +401,18 @@ define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi a3, a1, 128
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v16, (a3)
-; RV32-NEXT:    vle64.v v0, (a1)
+; RV32-NEXT:    vle64.v v8, (a3)
+; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v24, 0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vmseq.vv v8, v0, v24
 ; RV32-NEXT:    vmseq.vv v0, v16, v24
-; RV32-NEXT:    addi a1, a0, 128
-; RV32-NEXT:    vle64.v v16, (a1), v0.t
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vmseq.vv v16, v8, v24
 ; RV32-NEXT:    vle64.v v8, (a0), v0.t
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    vle64.v v16, (a0), v0.t
 ; RV32-NEXT:    vse64.v v8, (a2)
 ; RV32-NEXT:    addi a0, a2, 128
 ; RV32-NEXT:    vse64.v v16, (a0)
@@ -422,14 +422,14 @@ define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi a3, a1, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v16, (a1)
-; RV64-NEXT:    vle64.v v24, (a3)
-; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
-; RV64-NEXT:    addi a1, a0, 128
-; RV64-NEXT:    vle64.v v16, (a1), v0.t
-; RV64-NEXT:    vmv1r.v v0, v8
+; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    vle64.v v16, (a3)
+; RV64-NEXT:    vmseq.vi v0, v8, 0
+; RV64-NEXT:    vmseq.vi v24, v16, 0
 ; RV64-NEXT:    vle64.v v8, (a0), v0.t
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vle64.v v16, (a0), v0.t
 ; RV64-NEXT:    vse64.v v8, (a2)
 ; RV64-NEXT:    addi a0, a2, 128
 ; RV64-NEXT:    vse64.v v16, (a0)
@@ -484,14 +484,14 @@ define void @masked_load_v64i32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; CHECK-NEXT:    addi a3, a1, 128
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    vle32.v v24, (a3)
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
-; CHECK-NEXT:    vmseq.vi v0, v24, 0
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle32.v v16, (a1), v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a3)
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    addi a0, a2, 128
 ; CHECK-NEXT:    vse32.v v16, (a0)
@@ -528,14 +528,14 @@ define void @masked_load_v256i8(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 ; CHECK-NEXT:    addi a3, a1, 128
 ; CHECK-NEXT:    li a4, 128
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    vle8.v v24, (a3)
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
-; CHECK-NEXT:    vmseq.vi v0, v24, 0
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle8.v v16, (a1), v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v16, (a3)
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    vle8.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vle8.v v16, (a0), v0.t
 ; CHECK-NEXT:    vse8.v v8, (a2)
 ; CHECK-NEXT:    addi a0, a2, 128
 ; CHECK-NEXT:    vse8.v v16, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
index a1e81ea41c249b..38cd8310bad521 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
@@ -363,82 +363,38 @@ declare void @llvm.masked.store.v32f32.p0v32f32(<32 x float>, <32 x float>*, i32
 define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 x double>* %m_ptr) nounwind {
 ; RV32-LABEL: masked_store_v32f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    sub sp, sp, a3
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    addi a3, a2, 128
+; RV32-NEXT:    vle64.v v16, (a3)
 ; RV32-NEXT:    vle64.v v8, (a2)
-; RV32-NEXT:    addi a2, a2, 128
-; RV32-NEXT:    vle64.v v16, (a2)
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    fcvt.d.w fa5, zero
-; RV32-NEXT:    vmfeq.vf v0, v8, fa5
-; RV32-NEXT:    vle64.v v24, (a0)
+; RV32-NEXT:    vmfeq.vf v7, v16, fa5
+; RV32-NEXT:    vle64.v v16, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmfeq.vf v8, v16, fa5
-; RV32-NEXT:    vse64.v v24, (a1), v0.t
+; RV32-NEXT:    vle64.v v24, (a0)
+; RV32-NEXT:    vmfeq.vf v0, v8, fa5
+; RV32-NEXT:    vse64.v v16, (a1), v0.t
 ; RV32-NEXT:    addi a0, a1, 128
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vse64.v v8, (a0), v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vse64.v v24, (a0), v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: masked_store_v32f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a3, a3, 4
-; RV64-NEXT:    sub sp, sp, a3
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    addi a3, a2, 128
+; RV64-NEXT:    vle64.v v16, (a3)
 ; RV64-NEXT:    vle64.v v8, (a2)
-; RV64-NEXT:    addi a2, a2, 128
-; RV64-NEXT:    vle64.v v16, (a2)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    fmv.d.x fa5, zero
-; RV64-NEXT:    vmfeq.vf v0, v8, fa5
-; RV64-NEXT:    vle64.v v24, (a0)
+; RV64-NEXT:    vmfeq.vf v7, v16, fa5
+; RV64-NEXT:    vle64.v v16, (a0)
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vmfeq.vf v8, v16, fa5
-; RV64-NEXT:    vse64.v v24, (a1), v0.t
+; RV64-NEXT:    vle64.v v24, (a0)
+; RV64-NEXT:    vmfeq.vf v0, v8, fa5
+; RV64-NEXT:    vse64.v v16, (a1), v0.t
 ; RV64-NEXT:    addi a0, a1, 128
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vse64.v v8, (a0), v0.t
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vse64.v v24, (a0), v0.t
 ; RV64-NEXT:    ret
   %m = load <32 x double>, <32 x double>* %m_ptr
   %mask = fcmp oeq <32 x double> %m, zeroinitializer
@@ -470,43 +426,21 @@ declare void @llvm.masked.store.v64f16.p0v64f16(<64 x half>, <64 x half>*, i32,
 define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x float>* %m_ptr) nounwind {
 ; CHECK-LABEL: masked_store_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    sub sp, sp, a3
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    vle32.v v16, (a3)
 ; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a2, a2, 128
-; CHECK-NEXT:    vle32.v v16, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fmv.w.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
-; CHECK-NEXT:    vle32.v v24, (a0)
+; CHECK-NEXT:    vmfeq.vf v7, v16, fa5
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
-; CHECK-NEXT:    vse32.v v24, (a1), v0.t
+; CHECK-NEXT:    vle32.v v24, (a0)
+; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vse32.v v16, (a1), v0.t
 ; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse32.v v8, (a0), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vse32.v v24, (a0), v0.t
 ; CHECK-NEXT:    ret
   %m = load <64 x float>, <64 x float>* %m_ptr
   %mask = fcmp oeq <64 x float> %m, zeroinitializer
@@ -519,43 +453,21 @@ declare void @llvm.masked.store.v64f16.p0v64f32(<64 x float>, <64 x float>*, i32
 define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 x half>* %m_ptr) nounwind {
 ; CHECK-LABEL: masked_store_v128f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    sub sp, sp, a3
 ; CHECK-NEXT:    li a3, 64
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    vle16.v v16, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    addi a2, a2, 128
-; CHECK-NEXT:    vle16.v v16, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
-; CHECK-NEXT:    vle16.v v24, (a0)
+; CHECK-NEXT:    vmfeq.vf v7, v16, fa5
+; CHECK-NEXT:    vle16.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
-; CHECK-NEXT:    vse16.v v24, (a1), v0.t
+; CHECK-NEXT:    vle16.v v24, (a0)
+; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vse16.v v16, (a1), v0.t
 ; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vse16.v v24, (a0), v0.t
 ; CHECK-NEXT:    ret
   %m = load <128 x half>, <128 x half>* %m_ptr
   %mask = fcmp oeq <128 x half> %m, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
index b3011d0f01cab1..d3676d83a4f7c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -401,95 +401,60 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 18
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    sub sp, sp, a3
 ; RV32-NEXT:    addi a3, a2, 128
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a3)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a4, a3, 3
-; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vle64.v v0, (a2)
+; RV32-NEXT:    vle64.v v24, (a2)
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, 0
+; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vmseq.vv v8, v0, v24
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vmseq.vv v7, v24, v16
 ; RV32-NEXT:    addi a2, a0, 128
-; RV32-NEXT:    vle64.v v8, (a2)
-; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vle64.v v24, (a2)
+; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a2, a0, 3
-; RV32-NEXT:    add a0, a2, a0
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmseq.vv v0, v16, v24
-; RV32-NEXT:    addi a0, a1, 128
-; RV32-NEXT:    vse64.v v8, (a0), v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vmseq.vv v0, v8, v16
+; RV32-NEXT:    addi a0, a1, 128
+; RV32-NEXT:    vse64.v v24, (a0), v0.t
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse64.v v8, (a1), v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 18
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: masked_store_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a3, a3, 4
-; RV64-NEXT:    sub sp, sp, a3
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v8, (a2)
-; RV64-NEXT:    addi a2, a2, 128
+; RV64-NEXT:    addi a3, a2, 128
+; RV64-NEXT:    vle64.v v8, (a3)
 ; RV64-NEXT:    vle64.v v16, (a2)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmseq.vi v0, v8, 0
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vmseq.vi v7, v8, 0
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vse64.v v24, (a1), v0.t
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vle64.v v24, (a0)
+; RV64-NEXT:    vmseq.vi v0, v16, 0
+; RV64-NEXT:    vse64.v v8, (a1), v0.t
 ; RV64-NEXT:    addi a0, a1, 128
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vse64.v v8, (a0), v0.t
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    vse64.v v24, (a0), v0.t
 ; RV64-NEXT:    ret
   %m = load <32 x i64>, ptr %m_ptr
   %mask = icmp eq <32 x i64> %m, zeroinitializer
@@ -538,42 +503,20 @@ declare void @llvm.masked.store.v64i16.p0(<64 x i16>, ptr, i32, <64 x i1>)
 define void @masked_store_v64i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; CHECK-LABEL: masked_store_v64i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    sub sp, sp, a3
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a2, a2, 128
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    vle32.v v8, (a3)
 ; CHECK-NEXT:    vle32.v v16, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmseq.vi v7, v8, 0
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
-; CHECK-NEXT:    vse32.v v24, (a1), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle32.v v24, (a0)
+; CHECK-NEXT:    vmseq.vi v0, v16, 0
+; CHECK-NEXT:    vse32.v v8, (a1), v0.t
 ; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse32.v v8, (a0), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vse32.v v24, (a0), v0.t
 ; CHECK-NEXT:    ret
   %m = load <64 x i32>, ptr %m_ptr
   %mask = icmp eq <64 x i32> %m, zeroinitializer
@@ -604,42 +547,20 @@ declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
 define void @masked_store_v128i16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; CHECK-LABEL: masked_store_v128i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    sub sp, sp, a3
 ; CHECK-NEXT:    li a3, 64
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    addi a2, a2, 128
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    vle16.v v8, (a3)
 ; CHECK-NEXT:    vle16.v v16, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vle16.v v24, (a0)
-; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmseq.vi v7, v8, 0
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
-; CHECK-NEXT:    vse16.v v24, (a1), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle16.v v24, (a0)
+; CHECK-NEXT:    vmseq.vi v0, v16, 0
+; CHECK-NEXT:    vse16.v v8, (a1), v0.t
 ; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vse16.v v24, (a0), v0.t
 ; CHECK-NEXT:    ret
   %m = load <128 x i16>, ptr %m_ptr
   %mask = icmp eq <128 x i16> %m, zeroinitializer
@@ -652,42 +573,20 @@ declare void @llvm.masked.store.v128i16.p0(<128 x i16>, ptr, i32, <128 x i1>)
 define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; CHECK-LABEL: masked_store_v256i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    sub sp, sp, a3
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a2)
-; CHECK-NEXT:    addi a2, a2, 128
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    vle8.v v8, (a3)
 ; CHECK-NEXT:    vle8.v v16, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmseq.vi v7, v8, 0
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
-; CHECK-NEXT:    vse8.v v24, (a1), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    vmseq.vi v0, v16, 0
+; CHECK-NEXT:    vse8.v v8, (a1), v0.t
 ; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse8.v v8, (a0), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vse8.v v24, (a0), v0.t
 ; CHECK-NEXT:    ret
   %m = load <256 x i8>, ptr %m_ptr
   %mask = icmp eq <256 x i8> %m, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 19f3d3ce19fa4c..cc8d230808aded 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -135,21 +135,21 @@ declare <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -263,21 +263,21 @@ declare <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_nearbyint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -307,21 +307,21 @@ declare <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_nearbyint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -393,21 +393,21 @@ declare <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -437,21 +437,21 @@ declare <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -481,21 +481,21 @@ declare <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -525,21 +525,21 @@ declare <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -569,15 +569,6 @@ declare <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
@@ -589,42 +580,36 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v24, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
-; CHECK-NEXT:    vmv.v.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index 9df160bf30f005..c83ce798a2f6f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -179,24 +179,24 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    vslidedown.vi v25, v0, 4
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB8_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmv.s.f v25, fa0
+; CHECK-NEXT:    vfmv.s.f v24, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vfredusum.vs v25, v8, v25, v0.t
+; CHECK-NEXT:    vfredusum.vs v24, v8, v24, v0.t
 ; CHECK-NEXT:    addi a1, a0, -32
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfredusum.vs v25, v16, v25, v0.t
-; CHECK-NEXT:    vfmv.f.s fa0, v25
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfredusum.vs v24, v16, v24, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v24
 ; CHECK-NEXT:    ret
   %r = call reassoc float @llvm.vp.reduce.fadd.v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 %evl)
   ret float %r
@@ -207,24 +207,24 @@ define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    vslidedown.vi v25, v0, 4
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB9_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB9_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmv.s.f v25, fa0
+; CHECK-NEXT:    vfmv.s.f v24, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT:    vfredosum.vs v24, v8, v24, v0.t
 ; CHECK-NEXT:    addi a1, a0, -32
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfredosum.vs v25, v16, v25, v0.t
-; CHECK-NEXT:    vfmv.f.s fa0, v25
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfredosum.vs v24, v16, v24, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v24
 ; CHECK-NEXT:    ret
   %r = call float @llvm.vp.reduce.fadd.v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 %evl)
   ret float %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 7dcfb247d37cbe..46c7f31a005d01 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1780,41 +1780,26 @@ declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)
 define float @vreduce_fminimum_v64f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB109_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, 523264
 ; CHECK-NEXT:    fmv.w.x fa0, a0
-; CHECK-NEXT:    j .LBB109_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB109_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
   %red = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %v)
@@ -1854,18 +1839,18 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -1874,13 +1859,13 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1889,39 +1874,27 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB111_2
@@ -2118,40 +2091,25 @@ declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
 define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB121_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI121_0)
 ; CHECK-NEXT:    fld fa0, %lo(.LCPI121_0)(a0)
-; CHECK-NEXT:    j .LBB121_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB121_2:
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB121_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
   %red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v)
@@ -2194,13 +2152,13 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -2209,13 +2167,13 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -2224,39 +2182,27 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB123_2
@@ -2490,41 +2436,26 @@ declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)
 define float @vreduce_fmaximum_v64f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB135_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, 523264
 ; CHECK-NEXT:    fmv.w.x fa0, a0
-; CHECK-NEXT:    j .LBB135_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB135_2:
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB135_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
   %red = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %v)
@@ -2564,18 +2495,18 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -2584,13 +2515,13 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -2599,39 +2530,27 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB137_2
@@ -2828,40 +2747,25 @@ declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
 define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB147_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI147_0)
 ; CHECK-NEXT:    fld fa0, %lo(.LCPI147_0)(a0)
-; CHECK-NEXT:    j .LBB147_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB147_2:
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB147_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
   %red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v)
@@ -2904,13 +2808,13 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -2919,13 +2823,13 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -2934,39 +2838,27 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB149_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 2495178ea762d9..8f7a564ca69243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -843,24 +843,24 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    vslidedown.vi v25, v0, 4
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB49_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:  .LBB49_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    vmv.s.x v24, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT:    vredxor.vs v24, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a1, -32
 ; CHECK-NEXT:    sltu a1, a1, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vredxor.vs v25, v16, v25, v0.t
-; CHECK-NEXT:    vmv.x.s a0, v25
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vredxor.vs v24, v16, v24, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v24
 ; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.xor.v64i32(i32 %s, <64 x i32> %v, <64 x i1> %m, i32 %evl)
   ret i32 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 920d0d5fe7ba74..1e4f344c87637b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -123,19 +123,19 @@ declare <16 x half> @llvm.vp.rint.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_rint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x half> @llvm.vp.rint.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -239,19 +239,19 @@ declare <8 x float> @llvm.vp.rint.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_rint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.rint.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -279,19 +279,19 @@ declare <16 x float> @llvm.vp.rint.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_rint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.rint.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -357,19 +357,19 @@ declare <4 x double> @llvm.vp.rint.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_rint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.rint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -397,19 +397,19 @@ declare <8 x double> @llvm.vp.rint.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_rint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.rint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -437,19 +437,19 @@ declare <15 x double> @llvm.vp.rint.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_rint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.rint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -477,19 +477,19 @@ declare <16 x double> @llvm.vp.rint.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_rint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.rint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -517,21 +517,9 @@ declare <32 x double> @llvm.vp.rint.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -540,44 +528,32 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.rint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index 6f045349423c7b..91feb05a030842 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -194,23 +194,23 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ;
 ; ZVFHMIN-LABEL: vp_round_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <16 x half> @llvm.vp.round.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_round_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -431,21 +431,21 @@ declare <8 x float> @llvm.vp.round.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_round_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.round.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -475,21 +475,21 @@ declare <16 x float> @llvm.vp.round.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_round_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.round.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -561,21 +561,21 @@ declare <4 x double> @llvm.vp.round.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.round.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -605,21 +605,21 @@ declare <8 x double> @llvm.vp.round.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.round.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -649,21 +649,21 @@ declare <15 x double> @llvm.vp.round.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.round.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -693,21 +693,21 @@ declare <16 x double> @llvm.vp.round.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.round.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -737,21 +737,9 @@ declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -760,48 +748,36 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 4
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.round.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 738d7e37c50bde..89ba2d7a2c2f81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -194,23 +194,23 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <16 x half> @llvm.vp.roundeven.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -431,21 +431,21 @@ declare <8 x float> @llvm.vp.roundeven.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_roundeven_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.roundeven.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -475,21 +475,21 @@ declare <16 x float> @llvm.vp.roundeven.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_roundeven_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.roundeven.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -561,21 +561,21 @@ declare <4 x double> @llvm.vp.roundeven.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.roundeven.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -605,21 +605,21 @@ declare <8 x double> @llvm.vp.roundeven.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.roundeven.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -649,21 +649,21 @@ declare <15 x double> @llvm.vp.roundeven.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.roundeven.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -693,21 +693,21 @@ declare <16 x double> @llvm.vp.roundeven.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.roundeven.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -737,21 +737,9 @@ declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -760,48 +748,36 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 0
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.roundeven.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 6f5b7875266b12..4faee5645f46c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -194,23 +194,23 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_v8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_v16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -431,21 +431,21 @@ declare <8 x float> @llvm.vp.roundtozero.v8f32(<8 x float>, <8 x i1>, i32)
 define <8 x float> @vp_roundtozero_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.roundtozero.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -475,21 +475,21 @@ declare <16 x float> @llvm.vp.roundtozero.v16f32(<16 x float>, <16 x i1>, i32)
 define <16 x float> @vp_roundtozero_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.roundtozero.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -561,21 +561,21 @@ declare <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -605,21 +605,21 @@ declare <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -649,21 +649,21 @@ declare <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v15f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -693,21 +693,21 @@ declare <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -737,21 +737,9 @@ declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -760,48 +748,36 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 6ba90b00fdba56..72f86ddb4c36cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1163,7 +1163,9 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH32-NEXT:    addi a0, sp, 16
 ; ZVFH32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFH32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; ZVFH32-NEXT:    vslidedown.vi v7, v0, 8
+; ZVFH32-NEXT:    vmv1r.v v6, v0
+; ZVFH32-NEXT:    vslidedown.vi v25, v0, 8
+; ZVFH32-NEXT:    vmv.v.v v7, v25
 ; ZVFH32-NEXT:    mv a0, a2
 ; ZVFH32-NEXT:    bltu a2, a3, .LBB43_2
 ; ZVFH32-NEXT:  # %bb.1:
@@ -1172,21 +1174,23 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH32-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH32-NEXT:    addi a0, sp, 16
 ; ZVFH32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH32-NEXT:    vmv1r.v v0, v6
 ; ZVFH32-NEXT:    vmfeq.vv v6, v8, v24, v0.t
 ; ZVFH32-NEXT:    addi a0, a2, -64
 ; ZVFH32-NEXT:    sltu a1, a2, a0
 ; ZVFH32-NEXT:    addi a1, a1, -1
 ; ZVFH32-NEXT:    and a0, a1, a0
 ; ZVFH32-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH32-NEXT:    vmv1r.v v0, v7
 ; ZVFH32-NEXT:    csrr a0, vlenb
 ; ZVFH32-NEXT:    slli a0, a0, 3
 ; ZVFH32-NEXT:    add a0, sp, a0
 ; ZVFH32-NEXT:    addi a0, a0, 16
 ; ZVFH32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFH32-NEXT:    vmfeq.vv v24, v16, v8, v0.t
+; ZVFH32-NEXT:    vmv1r.v v25, v7
+; ZVFH32-NEXT:    vmv1r.v v0, v7
+; ZVFH32-NEXT:    vmfeq.vv v25, v16, v8, v0.t
 ; ZVFH32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; ZVFH32-NEXT:    vslideup.vi v6, v24, 8
+; ZVFH32-NEXT:    vslideup.vi v6, v25, 8
 ; ZVFH32-NEXT:    vmv.v.v v0, v6
 ; ZVFH32-NEXT:    csrr a0, vlenb
 ; ZVFH32-NEXT:    slli a0, a0, 4
@@ -1199,49 +1203,61 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH64-NEXT:    addi sp, sp, -16
 ; ZVFH64-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFH64-NEXT:    csrr a1, vlenb
-; ZVFH64-NEXT:    slli a1, a1, 4
+; ZVFH64-NEXT:    li a3, 24
+; ZVFH64-NEXT:    mul a1, a1, a3
 ; ZVFH64-NEXT:    sub sp, sp, a1
-; ZVFH64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFH64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; ZVFH64-NEXT:    addi a1, a0, 128
 ; ZVFH64-NEXT:    li a3, 64
 ; ZVFH64-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; ZVFH64-NEXT:    vle16.v v24, (a1)
 ; ZVFH64-NEXT:    csrr a1, vlenb
-; ZVFH64-NEXT:    slli a1, a1, 3
+; ZVFH64-NEXT:    slli a1, a1, 4
 ; ZVFH64-NEXT:    add a1, sp, a1
 ; ZVFH64-NEXT:    addi a1, a1, 16
 ; ZVFH64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFH64-NEXT:    vle16.v v24, (a0)
-; ZVFH64-NEXT:    addi a0, sp, 16
+; ZVFH64-NEXT:    csrr a0, vlenb
+; ZVFH64-NEXT:    slli a0, a0, 3
+; ZVFH64-NEXT:    add a0, sp, a0
+; ZVFH64-NEXT:    addi a0, a0, 16
 ; ZVFH64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH64-NEXT:    addi a0, sp, 16
+; ZVFH64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFH64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; ZVFH64-NEXT:    mv a0, a2
-; ZVFH64-NEXT:    vslidedown.vi v7, v0, 8
+; ZVFH64-NEXT:    vslidedown.vi v17, v0, 8
 ; ZVFH64-NEXT:    bltu a2, a3, .LBB43_2
 ; ZVFH64-NEXT:  # %bb.1:
 ; ZVFH64-NEXT:    li a0, 64
 ; ZVFH64-NEXT:  .LBB43_2:
 ; ZVFH64-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH64-NEXT:    addi a0, sp, 16
+; ZVFH64-NEXT:    csrr a0, vlenb
+; ZVFH64-NEXT:    slli a0, a0, 3
+; ZVFH64-NEXT:    add a0, sp, a0
+; ZVFH64-NEXT:    addi a0, a0, 16
 ; ZVFH64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH64-NEXT:    vmfeq.vv v6, v8, v24, v0.t
+; ZVFH64-NEXT:    vmfeq.vv v16, v8, v24, v0.t
 ; ZVFH64-NEXT:    addi a0, a2, -64
 ; ZVFH64-NEXT:    sltu a1, a2, a0
 ; ZVFH64-NEXT:    addi a1, a1, -1
 ; ZVFH64-NEXT:    and a0, a1, a0
 ; ZVFH64-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH64-NEXT:    vmv1r.v v0, v7
 ; ZVFH64-NEXT:    csrr a0, vlenb
-; ZVFH64-NEXT:    slli a0, a0, 3
+; ZVFH64-NEXT:    slli a0, a0, 4
 ; ZVFH64-NEXT:    add a0, sp, a0
 ; ZVFH64-NEXT:    addi a0, a0, 16
 ; ZVFH64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFH64-NEXT:    vmfeq.vv v24, v16, v8, v0.t
+; ZVFH64-NEXT:    addi a0, sp, 16
+; ZVFH64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH64-NEXT:    vmv1r.v v0, v17
+; ZVFH64-NEXT:    vmfeq.vv v17, v24, v8, v0.t
 ; ZVFH64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; ZVFH64-NEXT:    vslideup.vi v6, v24, 8
-; ZVFH64-NEXT:    vmv.v.v v0, v6
+; ZVFH64-NEXT:    vslideup.vi v16, v17, 8
+; ZVFH64-NEXT:    vmv.v.v v0, v16
 ; ZVFH64-NEXT:    csrr a0, vlenb
-; ZVFH64-NEXT:    slli a0, a0, 4
+; ZVFH64-NEXT:    li a1, 24
+; ZVFH64-NEXT:    mul a0, a0, a1
 ; ZVFH64-NEXT:    add sp, sp, a0
 ; ZVFH64-NEXT:    addi sp, sp, 16
 ; ZVFH64-NEXT:    ret
@@ -2906,50 +2922,75 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a3, 26
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 26 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a3, a1, 4
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs1r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a1, .LBB87_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB87_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v6, v8, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v24, v16, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v17, v8, v24, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vi v6, v24, 2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vslideup.vi v16, v17, 2
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 26
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index 981715bd2b998b..97b51817beba20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -598,7 +598,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
@@ -617,13 +617,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v7, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB51_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -633,7 +633,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmv1r.v v8, v6
+; CHECK-NEXT:    vmv1r.v v8, v7
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
@@ -646,7 +646,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_v256i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
@@ -655,16 +655,16 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a1, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB52_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB52_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    vmv1r.v v8, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
@@ -675,7 +675,7 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
 define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_swap_v256i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
@@ -684,16 +684,16 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m,
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a1, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB53_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB53_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    vmv1r.v v8, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
@@ -1264,7 +1264,9 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 4
+; RV32-NEXT:    vmv1r.v v6, v0
+; RV32-NEXT:    vslidedown.vi v25, v0, 4
+; RV32-NEXT:    vmv1r.v v7, v25
 ; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    bltu a2, a3, .LBB99_2
 ; RV32-NEXT:  # %bb.1:
@@ -1273,21 +1275,23 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv1r.v v0, v6
 ; RV32-NEXT:    vmseq.vv v6, v8, v24, v0.t
 ; RV32-NEXT:    addi a0, a2, -32
 ; RV32-NEXT:    sltu a1, a2, a0
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmseq.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vmv1r.v v25, v7
+; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vmseq.vv v25, v16, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vi v6, v24, 4
+; RV32-NEXT:    vslideup.vi v6, v25, 4
 ; RV32-NEXT:    vmv1r.v v0, v6
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1300,49 +1304,61 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v24, (a0)
-; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64-NEXT:    mv a0, a2
-; RV64-NEXT:    vslidedown.vi v7, v0, 4
+; RV64-NEXT:    vslidedown.vi v17, v0, 4
 ; RV64-NEXT:    bltu a2, a3, .LBB99_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a0, 32
 ; RV64-NEXT:  .LBB99_2:
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vmseq.vv v6, v8, v24, v0.t
+; RV64-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, a2, -32
 ; RV64-NEXT:    sltu a1, a2, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vmseq.vv v24, v16, v8, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v17
+; RV64-NEXT:    vmseq.vv v17, v24, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vi v6, v24, 4
-; RV64-NEXT:    vmv1r.v v0, v6
+; RV64-NEXT:    vslideup.vi v16, v17, 4
+; RV64-NEXT:    vmv1r.v v0, v16
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -1355,24 +1371,24 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    vslidedown.vi v25, v0, 4
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB100_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:  .LBB100_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v8, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v8, a0, v0.t
 ; CHECK-NEXT:    addi a2, a1, -32
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v25, v8, 4
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vslideup.vi v24, v8, 4
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <64 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <64 x i32> %elt.head, <64 x i32> poison, <64 x i32> zeroinitializer
@@ -1385,24 +1401,24 @@ define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    vslidedown.vi v25, v0, 4
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB101_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:  .LBB101_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v8, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v8, a0, v0.t
 ; CHECK-NEXT:    addi a2, a1, -32
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v25, v8, 4
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vslideup.vi v24, v8, 4
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <64 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <64 x i32> %elt.head, <64 x i32> poison, <64 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll
index 038fead011d899..ab24c67ed19142 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll
@@ -20,9 +20,9 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-LABEL: trn2.v8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    li a0, 170
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -75,8 +75,8 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-LABEL: trn2.v4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -99,9 +99,9 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: trn2.v8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    li a0, 170
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -122,8 +122,8 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: trn2.v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 1, i32 3>
@@ -146,8 +146,8 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: trn2.v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -168,8 +168,8 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: trn2.v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 1, i32 3>
@@ -190,8 +190,8 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) {
 ; CHECK-LABEL: trn2.v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 1, i32 3>
@@ -214,8 +214,8 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-LABEL: trn2.v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -236,8 +236,8 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) {
 ; CHECK-LABEL: trn2.v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 1, i32 3>
@@ -260,8 +260,8 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) {
 ; CHECK-LABEL: trn2.v4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -284,9 +284,9 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-LABEL: trn2.v8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    li a0, 170
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 6a8d2008de74dd..c8ac148ad91c92 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -469,7 +469,6 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 ; CHECK-LABEL: strided_vpload_v32f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    mv a3, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB40_2
 ; CHECK-NEXT:  # %bb.1:
@@ -482,7 +481,8 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v9, 2
+; CHECK-NEXT:    vslidedown.vi v8, v0, 2
+; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vlse64.v v16, (a4), a1, v0.t
@@ -525,7 +525,7 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-LABEL: strided_load_v33f64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    li a5, 32
-; CHECK-RV32-NEXT:    vmv1r.v v8, v0
+; CHECK-RV32-NEXT:    vmv1r.v v16, v0
 ; CHECK-RV32-NEXT:    mv a3, a4
 ; CHECK-RV32-NEXT:    bltu a4, a5, .LBB42_2
 ; CHECK-RV32-NEXT:  # %bb.1:
@@ -543,9 +543,9 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    li a7, 16
 ; CHECK-RV32-NEXT:  .LBB42_4:
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 4
+; CHECK-RV32-NEXT:    vslidedown.vi v0, v16, 4
 ; CHECK-RV32-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v16, (a6), a2, v0.t
+; CHECK-RV32-NEXT:    vlse64.v v8, (a6), a2, v0.t
 ; CHECK-RV32-NEXT:    addi a6, a3, -16
 ; CHECK-RV32-NEXT:    sltu a3, a3, a6
 ; CHECK-RV32-NEXT:    addi a3, a3, -1
@@ -557,25 +557,27 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    mul a5, a4, a2
 ; CHECK-RV32-NEXT:    add a5, a1, a5
 ; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-RV32-NEXT:    vslidedown.vi v9, v16, 2
+; CHECK-RV32-NEXT:    vmv1r.v v10, v16
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v24, (a5), a2, v0.t
+; CHECK-RV32-NEXT:    vmv1r.v v0, v9
+; CHECK-RV32-NEXT:    vlse64.v v16, (a5), a2, v0.t
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vmv1r.v v0, v8
-; CHECK-RV32-NEXT:    vlse64.v v8, (a1), a2, v0.t
+; CHECK-RV32-NEXT:    vmv1r.v v0, v10
+; CHECK-RV32-NEXT:    vlse64.v v24, (a1), a2, v0.t
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vse64.v v8, (a0)
+; CHECK-RV32-NEXT:    vse64.v v24, (a0)
 ; CHECK-RV32-NEXT:    addi a1, a0, 128
-; CHECK-RV32-NEXT:    vse64.v v24, (a1)
+; CHECK-RV32-NEXT:    vse64.v v16, (a1)
 ; CHECK-RV32-NEXT:    addi a0, a0, 256
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vse64.v v16, (a0)
+; CHECK-RV32-NEXT:    vse64.v v8, (a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: strided_load_v33f64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    li a5, 32
-; CHECK-RV64-NEXT:    vmv1r.v v8, v0
+; CHECK-RV64-NEXT:    vmv1r.v v16, v0
 ; CHECK-RV64-NEXT:    mv a4, a3
 ; CHECK-RV64-NEXT:    bltu a3, a5, .LBB42_2
 ; CHECK-RV64-NEXT:  # %bb.1:
@@ -593,9 +595,9 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    li a7, 16
 ; CHECK-RV64-NEXT:  .LBB42_4:
 ; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 4
+; CHECK-RV64-NEXT:    vslidedown.vi v0, v16, 4
 ; CHECK-RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; CHECK-RV64-NEXT:    vlse64.v v16, (a6), a2, v0.t
+; CHECK-RV64-NEXT:    vlse64.v v8, (a6), a2, v0.t
 ; CHECK-RV64-NEXT:    addi a6, a4, -16
 ; CHECK-RV64-NEXT:    sltu a4, a4, a6
 ; CHECK-RV64-NEXT:    addi a4, a4, -1
@@ -607,19 +609,21 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    mul a5, a3, a2
 ; CHECK-RV64-NEXT:    add a5, a1, a5
 ; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-RV64-NEXT:    vslidedown.vi v9, v16, 2
+; CHECK-RV64-NEXT:    vmv1r.v v10, v16
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-RV64-NEXT:    vlse64.v v24, (a5), a2, v0.t
+; CHECK-RV64-NEXT:    vmv1r.v v0, v9
+; CHECK-RV64-NEXT:    vlse64.v v16, (a5), a2, v0.t
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-RV64-NEXT:    vmv1r.v v0, v8
-; CHECK-RV64-NEXT:    vlse64.v v8, (a1), a2, v0.t
+; CHECK-RV64-NEXT:    vmv1r.v v0, v10
+; CHECK-RV64-NEXT:    vlse64.v v24, (a1), a2, v0.t
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-RV64-NEXT:    vse64.v v8, (a0)
+; CHECK-RV64-NEXT:    vse64.v v24, (a0)
 ; CHECK-RV64-NEXT:    addi a1, a0, 128
-; CHECK-RV64-NEXT:    vse64.v v24, (a1)
+; CHECK-RV64-NEXT:    vse64.v v16, (a1)
 ; CHECK-RV64-NEXT:    addi a0, a0, 256
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vse64.v v16, (a0)
+; CHECK-RV64-NEXT:    vse64.v v8, (a0)
 ; CHECK-RV64-NEXT:    ret
   %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
   ret <33 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index 4f16ce28bbb7e8..84412a7d075c24 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -222,283 +222,6 @@ define <2 x i32> @vtrunc_v2i32_v2i64_unmasked(<2 x i64> %a, i32 zeroext %vl) {
 declare <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64>, <128 x i1>, i32)
 
 define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: vtrunc_v128i32_v128i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 6
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 40
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v5, v0, 8
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v26, v0, 4
-; CHECK-NEXT:    addi a2, a1, 512
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 48
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v27, v5, 4
-; CHECK-NEXT:    addi a2, a1, 640
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a2)
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    addi a2, a7, -64
-; CHECK-NEXT:    sltu a3, a7, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a4, a3, a2
-; CHECK-NEXT:    addi a2, a4, -32
-; CHECK-NEXT:    sltu a3, a4, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a3, a3, a2
-; CHECK-NEXT:    addi a2, a3, -16
-; CHECK-NEXT:    sltu a5, a3, a2
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a2, a5, a2
-; CHECK-NEXT:    vslidedown.vi v0, v27, 2
-; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a2, a2, a5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    addi a5, a1, 128
-; CHECK-NEXT:    bltu a3, a2, .LBB16_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 16
-; CHECK-NEXT:  .LBB16_2:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v28, v26, 2
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a5)
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT:    li a3, 64
-; CHECK-NEXT:    vmv1r.v v0, v27
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 48
-; CHECK-NEXT:    mul a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 56
-; CHECK-NEXT:    mul a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    mv a6, a7
-; CHECK-NEXT:    bltu a7, a3, .LBB16_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    li a6, 64
-; CHECK-NEXT:  .LBB16_4:
-; CHECK-NEXT:    addi a5, a1, 384
-; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi t0, a6, -32
-; CHECK-NEXT:    sltu a6, a6, t0
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a6, a6, t0
-; CHECK-NEXT:    addi t0, a6, -16
-; CHECK-NEXT:    sltu t1, a6, t0
-; CHECK-NEXT:    addi t1, t1, -1
-; CHECK-NEXT:    and t0, t1, t0
-; CHECK-NEXT:    vsetvli zero, t0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v28
-; CHECK-NEXT:    addi t0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (t0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 4
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a6, a2, .LBB16_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    li a6, 16
-; CHECK-NEXT:  .LBB16_6:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v20, v5, 2
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a5)
-; CHECK-NEXT:    addi a1, a1, 256
-; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 48
-; CHECK-NEXT:    mul a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    bltu a4, a3, .LBB16_8
-; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    li a5, 32
-; CHECK-NEXT:  .LBB16_8:
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    addi a1, a5, -16
-; CHECK-NEXT:    sltu a5, a5, a1
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a1, a5, a1
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    bltu a4, a2, .LBB16_10
-; CHECK-NEXT:  # %bb.9:
-; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:  .LBB16_10:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v6, v7, 2
-; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v5
-; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    mv a1, a7
-; CHECK-NEXT:    bltu a7, a3, .LBB16_12
-; CHECK-NEXT:  # %bb.11:
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:  .LBB16_12:
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v24, v8
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 56
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v24, 16
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 56
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v24, v8
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 48
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v24, 16
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 48
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv4r.v v8, v0
-; CHECK-NEXT:    vslideup.vi v8, v16, 16
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a4, a1, -16
-; CHECK-NEXT:    sltu a1, a1, a4
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a4
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    bltu a7, a2, .LBB16_14
-; CHECK-NEXT:  # %bb.13:
-; CHECK-NEXT:    li a7, 16
-; CHECK-NEXT:  .LBB16_14:
-; CHECK-NEXT:    vsetvli zero, a7, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v16, v8, 16
-; CHECK-NEXT:    vse32.v v16, (a0)
-; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse32.v v8, (a1)
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 48
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse32.v v8, (a1)
-; CHECK-NEXT:    addi a0, a0, 384
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 56
-; CHECK-NEXT:    mul a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 6
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
   %v = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> %a, <128 x i1> %m, i32 %vl)
   ret <128 x i32> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll
index 70b547759938f4..920e0b66af7530 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll
@@ -163,12 +163,12 @@ define <8 x i1> @vaaddu_vv_v8i1_floor(<8 x i1> %x, <8 x i1> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i1_floor:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrwi vxrm, 2
-; CHECK-NEXT:    vaaddu.vv v8, v10, v8
+; CHECK-NEXT:    vaaddu.vv v8, v9, v8
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
@@ -408,12 +408,12 @@ define <8 x i1> @vaaddu_vv_v8i1_ceil(<8 x i1> %x, <8 x i1> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i1_ceil:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrwi vxrm, 0
-; CHECK-NEXT:    vaaddu.vv v8, v10, v8
+; CHECK-NEXT:    vaaddu.vv v8, v9, v8
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 2c62cbd583d00c..0df1702f7acbec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -363,7 +363,7 @@ declare <256 x i8> @llvm.vp.add.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
 define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vadd_vi_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
@@ -378,7 +378,7 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.add.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
index 507cf5cc6b80cc..fac3ad109b3c6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
@@ -1139,17 +1139,17 @@ define <11 x i64> @vand_vv_v11i64_unmasked(<11 x i64> %va, <11 x i64> %b, i32 ze
 define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vand_vx_v11i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v16, v0
+; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    lui a3, 341
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vmv.s.x v0, a3
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vmerge.vxm v24, v24, a0, v0
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 341
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vmerge.vxm v16, v16, a0, v0
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v16
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vand_vx_v11i64:
@@ -1168,10 +1168,10 @@ define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    lui a3, 341
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vmv.s.x v0, a3
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 341
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vmerge.vxm v16, v16, a0, v0
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index 01b07b4081e6d9..10dc46a063ecca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -323,9 +323,9 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vfsgnj.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
index 1a75c50f2b646f..55e1a1d4daf8b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
@@ -2577,10 +2577,10 @@ define <16 x i1> @fcmp_ogt_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -2593,10 +2593,10 @@ define <16 x i1> @fcmp_ogt_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2611,10 +2611,10 @@ define <16 x i1> @fcmp_ogt_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2628,10 +2628,10 @@ define <16 x i1> @fcmp_oge_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -2644,10 +2644,10 @@ define <16 x i1> @fcmp_oge_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2662,10 +2662,10 @@ define <16 x i1> @fcmp_oge_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2679,10 +2679,10 @@ define <16 x i1> @fcmp_olt_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -2695,10 +2695,10 @@ define <16 x i1> @fcmp_olt_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2713,10 +2713,10 @@ define <16 x i1> @fcmp_olt_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2730,10 +2730,10 @@ define <16 x i1> @fcmp_ole_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -2746,10 +2746,10 @@ define <16 x i1> @fcmp_ole_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2764,10 +2764,10 @@ define <16 x i1> @fcmp_ole_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2781,12 +2781,12 @@ define <16 x i1> @fcmp_one_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -2799,12 +2799,12 @@ define <16 x i1> @fcmp_one_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2819,12 +2819,12 @@ define <16 x i1> @fcmp_one_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2880,12 +2880,12 @@ define <16 x i1> @fcmp_ueq_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -2898,12 +2898,12 @@ define <16 x i1> @fcmp_ueq_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2918,12 +2918,12 @@ define <16 x i1> @fcmp_ueq_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -2937,8 +2937,8 @@ define <16 x i1> @fcmp_ugt_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -2953,8 +2953,8 @@ define <16 x i1> @fcmp_ugt_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2971,8 +2971,8 @@ define <16 x i1> @fcmp_ugt_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2988,8 +2988,8 @@ define <16 x i1> @fcmp_uge_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3004,8 +3004,8 @@ define <16 x i1> @fcmp_uge_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -3022,8 +3022,8 @@ define <16 x i1> @fcmp_uge_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -3039,8 +3039,8 @@ define <16 x i1> @fcmp_ult_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3055,8 +3055,8 @@ define <16 x i1> @fcmp_ult_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -3073,8 +3073,8 @@ define <16 x i1> @fcmp_ult_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -3090,8 +3090,8 @@ define <16 x i1> @fcmp_ule_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3106,8 +3106,8 @@ define <16 x i1> @fcmp_ule_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -3124,8 +3124,8 @@ define <16 x i1> @fcmp_ule_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -3256,10 +3256,10 @@ define <32 x i1> @fcmp_ogt_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <32 x i1> %1
@@ -3273,10 +3273,10 @@ define <32 x i1> @fcmp_ogt_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3292,10 +3292,10 @@ define <32 x i1> @fcmp_ogt_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3310,10 +3310,10 @@ define <32 x i1> @fcmp_oge_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <32 x i1> %1
@@ -3327,10 +3327,10 @@ define <32 x i1> @fcmp_oge_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3346,10 +3346,10 @@ define <32 x i1> @fcmp_oge_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3364,10 +3364,10 @@ define <32 x i1> @fcmp_olt_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <32 x i1> %1
@@ -3381,10 +3381,10 @@ define <32 x i1> @fcmp_olt_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3400,10 +3400,10 @@ define <32 x i1> @fcmp_olt_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3418,10 +3418,10 @@ define <32 x i1> @fcmp_ole_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <32 x i1> %1
@@ -3435,10 +3435,10 @@ define <32 x i1> @fcmp_ole_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3454,10 +3454,10 @@ define <32 x i1> @fcmp_ole_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3472,12 +3472,12 @@ define <32 x i1> @fcmp_one_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <32 x i1> %1
@@ -3491,12 +3491,12 @@ define <32 x i1> @fcmp_one_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3512,12 +3512,12 @@ define <32 x i1> @fcmp_one_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3577,12 +3577,12 @@ define <32 x i1> @fcmp_ueq_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <32 x i1> %1
@@ -3596,12 +3596,12 @@ define <32 x i1> @fcmp_ueq_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3617,12 +3617,12 @@ define <32 x i1> @fcmp_ueq_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3637,8 +3637,8 @@ define <32 x i1> @fcmp_ugt_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3654,8 +3654,8 @@ define <32 x i1> @fcmp_ugt_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3673,8 +3673,8 @@ define <32 x i1> @fcmp_ugt_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3691,8 +3691,8 @@ define <32 x i1> @fcmp_uge_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3708,8 +3708,8 @@ define <32 x i1> @fcmp_uge_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3727,8 +3727,8 @@ define <32 x i1> @fcmp_uge_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3745,8 +3745,8 @@ define <32 x i1> @fcmp_ult_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3762,8 +3762,8 @@ define <32 x i1> @fcmp_ult_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3781,8 +3781,8 @@ define <32 x i1> @fcmp_ult_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3799,8 +3799,8 @@ define <32 x i1> @fcmp_ule_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3816,8 +3816,8 @@ define <32 x i1> @fcmp_ule_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3835,8 +3835,8 @@ define <32 x i1> @fcmp_ule_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -5868,10 +5868,10 @@ define <8 x i1> @fcmp_ogt_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -5884,10 +5884,10 @@ define <8 x i1> @fcmp_ogt_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -5902,10 +5902,10 @@ define <8 x i1> @fcmp_ogt_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -5919,10 +5919,10 @@ define <8 x i1> @fcmp_oge_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -5935,10 +5935,10 @@ define <8 x i1> @fcmp_oge_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -5953,10 +5953,10 @@ define <8 x i1> @fcmp_oge_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -5970,10 +5970,10 @@ define <8 x i1> @fcmp_olt_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -5986,10 +5986,10 @@ define <8 x i1> @fcmp_olt_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6004,10 +6004,10 @@ define <8 x i1> @fcmp_olt_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6021,10 +6021,10 @@ define <8 x i1> @fcmp_ole_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -6037,10 +6037,10 @@ define <8 x i1> @fcmp_ole_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6055,10 +6055,10 @@ define <8 x i1> @fcmp_ole_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6072,12 +6072,12 @@ define <8 x i1> @fcmp_one_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -6090,12 +6090,12 @@ define <8 x i1> @fcmp_one_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6110,12 +6110,12 @@ define <8 x i1> @fcmp_one_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6171,12 +6171,12 @@ define <8 x i1> @fcmp_ueq_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -6189,12 +6189,12 @@ define <8 x i1> @fcmp_ueq_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6209,12 +6209,12 @@ define <8 x i1> @fcmp_ueq_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -6228,8 +6228,8 @@ define <8 x i1> @fcmp_ugt_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6244,8 +6244,8 @@ define <8 x i1> @fcmp_ugt_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6262,8 +6262,8 @@ define <8 x i1> @fcmp_ugt_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6279,8 +6279,8 @@ define <8 x i1> @fcmp_uge_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6295,8 +6295,8 @@ define <8 x i1> @fcmp_uge_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6313,8 +6313,8 @@ define <8 x i1> @fcmp_uge_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6330,8 +6330,8 @@ define <8 x i1> @fcmp_ult_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6346,8 +6346,8 @@ define <8 x i1> @fcmp_ult_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6364,8 +6364,8 @@ define <8 x i1> @fcmp_ult_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6381,8 +6381,8 @@ define <8 x i1> @fcmp_ule_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6397,8 +6397,8 @@ define <8 x i1> @fcmp_ule_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6415,8 +6415,8 @@ define <8 x i1> @fcmp_ule_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -6543,10 +6543,10 @@ define <16 x i1> @fcmp_ogt_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -6559,10 +6559,10 @@ define <16 x i1> @fcmp_ogt_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6577,10 +6577,10 @@ define <16 x i1> @fcmp_ogt_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6594,10 +6594,10 @@ define <16 x i1> @fcmp_oge_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -6610,10 +6610,10 @@ define <16 x i1> @fcmp_oge_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6628,10 +6628,10 @@ define <16 x i1> @fcmp_oge_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6645,10 +6645,10 @@ define <16 x i1> @fcmp_olt_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -6661,10 +6661,10 @@ define <16 x i1> @fcmp_olt_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6679,10 +6679,10 @@ define <16 x i1> @fcmp_olt_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6696,10 +6696,10 @@ define <16 x i1> @fcmp_ole_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -6712,10 +6712,10 @@ define <16 x i1> @fcmp_ole_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6730,10 +6730,10 @@ define <16 x i1> @fcmp_ole_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6747,12 +6747,12 @@ define <16 x i1> @fcmp_one_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -6765,12 +6765,12 @@ define <16 x i1> @fcmp_one_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6785,12 +6785,12 @@ define <16 x i1> @fcmp_one_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6846,12 +6846,12 @@ define <16 x i1> @fcmp_ueq_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <16 x i1> %1
@@ -6864,12 +6864,12 @@ define <16 x i1> @fcmp_ueq_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6884,12 +6884,12 @@ define <16 x i1> @fcmp_ueq_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6903,8 +6903,8 @@ define <16 x i1> @fcmp_ugt_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6919,8 +6919,8 @@ define <16 x i1> @fcmp_ugt_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6937,8 +6937,8 @@ define <16 x i1> @fcmp_ugt_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6954,8 +6954,8 @@ define <16 x i1> @fcmp_uge_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6970,8 +6970,8 @@ define <16 x i1> @fcmp_uge_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6988,8 +6988,8 @@ define <16 x i1> @fcmp_uge_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -7005,8 +7005,8 @@ define <16 x i1> @fcmp_ult_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7021,8 +7021,8 @@ define <16 x i1> @fcmp_ult_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -7039,8 +7039,8 @@ define <16 x i1> @fcmp_ult_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -7056,8 +7056,8 @@ define <16 x i1> @fcmp_ule_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7072,8 +7072,8 @@ define <16 x i1> @fcmp_ule_vf_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -7090,8 +7090,8 @@ define <16 x i1> @fcmp_ule_fv_v16f32(<16 x float> %va, float %b) nounwind strict
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8484,10 +8484,10 @@ define <4 x i1> @fcmp_ogt_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <4 x i1> %1
@@ -8500,10 +8500,10 @@ define <4 x i1> @fcmp_ogt_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8518,10 +8518,10 @@ define <4 x i1> @fcmp_ogt_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8535,10 +8535,10 @@ define <4 x i1> @fcmp_oge_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <4 x i1> %1
@@ -8551,10 +8551,10 @@ define <4 x i1> @fcmp_oge_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8569,10 +8569,10 @@ define <4 x i1> @fcmp_oge_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8586,10 +8586,10 @@ define <4 x i1> @fcmp_olt_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <4 x i1> %1
@@ -8602,10 +8602,10 @@ define <4 x i1> @fcmp_olt_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8620,10 +8620,10 @@ define <4 x i1> @fcmp_olt_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8637,10 +8637,10 @@ define <4 x i1> @fcmp_ole_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <4 x i1> %1
@@ -8653,10 +8653,10 @@ define <4 x i1> @fcmp_ole_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8671,10 +8671,10 @@ define <4 x i1> @fcmp_ole_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8688,12 +8688,12 @@ define <4 x i1> @fcmp_one_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <4 x i1> %1
@@ -8706,12 +8706,12 @@ define <4 x i1> @fcmp_one_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8726,12 +8726,12 @@ define <4 x i1> @fcmp_one_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8787,12 +8787,12 @@ define <4 x i1> @fcmp_ueq_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <4 x i1> %1
@@ -8805,12 +8805,12 @@ define <4 x i1> @fcmp_ueq_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8825,12 +8825,12 @@ define <4 x i1> @fcmp_ueq_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -8844,8 +8844,8 @@ define <4 x i1> @fcmp_ugt_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8860,8 +8860,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8878,8 +8878,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8895,8 +8895,8 @@ define <4 x i1> @fcmp_uge_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8911,8 +8911,8 @@ define <4 x i1> @fcmp_uge_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8929,8 +8929,8 @@ define <4 x i1> @fcmp_uge_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8946,8 +8946,8 @@ define <4 x i1> @fcmp_ult_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8962,8 +8962,8 @@ define <4 x i1> @fcmp_ult_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8980,8 +8980,8 @@ define <4 x i1> @fcmp_ult_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8997,8 +8997,8 @@ define <4 x i1> @fcmp_ule_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9013,8 +9013,8 @@ define <4 x i1> @fcmp_ule_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -9031,8 +9031,8 @@ define <4 x i1> @fcmp_ule_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -9159,10 +9159,10 @@ define <8 x i1> @fcmp_ogt_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -9175,10 +9175,10 @@ define <8 x i1> @fcmp_ogt_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9193,10 +9193,10 @@ define <8 x i1> @fcmp_ogt_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9210,10 +9210,10 @@ define <8 x i1> @fcmp_oge_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -9226,10 +9226,10 @@ define <8 x i1> @fcmp_oge_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9244,10 +9244,10 @@ define <8 x i1> @fcmp_oge_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9261,10 +9261,10 @@ define <8 x i1> @fcmp_olt_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -9277,10 +9277,10 @@ define <8 x i1> @fcmp_olt_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9295,10 +9295,10 @@ define <8 x i1> @fcmp_olt_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9312,10 +9312,10 @@ define <8 x i1> @fcmp_ole_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -9328,10 +9328,10 @@ define <8 x i1> @fcmp_ole_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9346,10 +9346,10 @@ define <8 x i1> @fcmp_ole_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9363,12 +9363,12 @@ define <8 x i1> @fcmp_one_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -9381,12 +9381,12 @@ define <8 x i1> @fcmp_one_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9401,12 +9401,12 @@ define <8 x i1> @fcmp_one_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9462,12 +9462,12 @@ define <8 x i1> @fcmp_ueq_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <8 x i1> %1
@@ -9480,12 +9480,12 @@ define <8 x i1> @fcmp_ueq_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9500,12 +9500,12 @@ define <8 x i1> @fcmp_ueq_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
@@ -9519,8 +9519,8 @@ define <8 x i1> @fcmp_ugt_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9535,8 +9535,8 @@ define <8 x i1> @fcmp_ugt_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9553,8 +9553,8 @@ define <8 x i1> @fcmp_ugt_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9570,8 +9570,8 @@ define <8 x i1> @fcmp_uge_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9586,8 +9586,8 @@ define <8 x i1> @fcmp_uge_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9604,8 +9604,8 @@ define <8 x i1> @fcmp_uge_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9621,8 +9621,8 @@ define <8 x i1> @fcmp_ult_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9637,8 +9637,8 @@ define <8 x i1> @fcmp_ult_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9655,8 +9655,8 @@ define <8 x i1> @fcmp_ult_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9672,8 +9672,8 @@ define <8 x i1> @fcmp_ule_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9688,8 +9688,8 @@ define <8 x i1> @fcmp_ule_vf_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9706,8 +9706,8 @@ define <8 x i1> @fcmp_ule_fv_v8f64(<8 x double> %va, double %b) nounwind strictf
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index 0574773fb2fd93..5cd1771e7ac4a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -890,7 +890,6 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -902,6 +901,7 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index ffa88e28d7dc86..1050ca033a0aff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -415,9 +415,9 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 17f851e172f814..e6f408d7e49b97 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -415,9 +415,9 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index 288efb0f1fc27c..1835f9238f8273 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -654,7 +654,6 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -666,6 +665,7 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index 3db44e87109bd4..aa283c02b67c25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -268,7 +268,7 @@ declare <256 x i8> @llvm.vp.smax.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
 define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vx_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
@@ -283,7 +283,7 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index c97c2232715f56..bb432be7d47e0d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -267,7 +267,7 @@ declare <256 x i8> @llvm.vp.umax.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
 define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmaxu_vx_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
@@ -282,7 +282,7 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index eaa19110a2a28c..24e81df404c94d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -268,7 +268,7 @@ declare <256 x i8> @llvm.vp.smin.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
 define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vx_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
@@ -283,7 +283,7 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index 48175e5b905ba5..e26089e2648b22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -267,7 +267,7 @@ declare <256 x i8> @llvm.vp.umin.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
 define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vminu_vx_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
@@ -282,7 +282,7 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index a13f1eed8efb1d..e65decfaa7d220 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -2444,7 +2444,6 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_v32f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vnsrl.wi v24, v16, 0
 ; RV32-NEXT:    vnsrl.wi v16, v8, 0
@@ -2459,6 +2458,7 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v8, v0.t
@@ -2468,7 +2468,7 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB96_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index 9ef89352e65e55..2124e103e0191c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -395,7 +395,7 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpload_v33f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a4, 32
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    mv a3, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
@@ -404,40 +404,40 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    addi a4, a3, -16
 ; CHECK-NEXT:    sltu a3, a3, a4
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    and a5, a3, a4
 ; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a4), v0.t
 ; CHECK-NEXT:    addi a3, a2, -32
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a4, a4, a3
+; CHECK-NEXT:    sltu a6, a2, a3
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    and a6, a6, a3
 ; CHECK-NEXT:    li a3, 16
-; CHECK-NEXT:    bltu a4, a3, .LBB32_4
+; CHECK-NEXT:    vslidedown.vi v0, v24, 2
+; CHECK-NEXT:    bltu a6, a3, .LBB32_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    li a4, 16
+; CHECK-NEXT:    li a6, 16
 ; CHECK-NEXT:  .LBB32_4:
-; CHECK-NEXT:    addi a5, a1, 256
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a4), v0.t
+; CHECK-NEXT:    addi a4, a1, 256
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 4
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a5), v0.t
+; CHECK-NEXT:    vslidedown.vi v0, v24, 4
+; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a4), v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB32_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:  .LBB32_6:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vle64.v v8, (a1), v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vle64.v v24, (a1), v0.t
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    vse64.v v24, (a0)
 ; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vse64.v v16, (a1)
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    addi a0, a0, 256
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vse64.v v24, (a0)
+; CHECK-NEXT:    vse64.v v16, (a0)
 ; CHECK-NEXT:    ret
   %load = call <33 x double> @llvm.vp.load.v33f64.p0(ptr %ptr, <33 x i1> %m, i32 %evl)
   ret <33 x double> %load
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index cd9a38d5167d5a..46bbb76eefe870 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1851,28 +1851,28 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 10
-; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x0a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 10 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v24, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v0, v24
+; RV64-NEXT:    vsext.vf2 v16, v24
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v24, 16
+; RV64-NEXT:    vslidedown.vi v8, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v16, v24
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsext.vf2 v24, v8
+; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB81_2
 ; RV64-NEXT:  # %bb.1:
@@ -1880,8 +1880,8 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:  .LBB81_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsoxei64.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
@@ -1890,13 +1890,13 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 10
-; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -1938,28 +1938,28 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 10
-; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x0a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 10 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v24, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v0, v24
+; RV64-NEXT:    vzext.vf2 v16, v24
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v24, 16
+; RV64-NEXT:    vslidedown.vi v8, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v16, v24
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vzext.vf2 v24, v8
+; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB82_2
 ; RV64-NEXT:  # %bb.1:
@@ -1967,8 +1967,8 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:  .LBB82_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsoxei64.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
@@ -1977,13 +1977,13 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 10
-; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index 291629de6dcfab..889ecd4d06c37b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -372,7 +372,7 @@ declare <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>,
 define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsadd_vi_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
@@ -387,7 +387,7 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
index d38ee1148e894e..411bead6b8dec8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -368,7 +368,7 @@ declare <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>,
 define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsaddu_vi_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
@@ -383,7 +383,7 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index d05f580ea7d222..fb04d5353227f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -158,48 +158,38 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v9, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v7, v8
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a3, -128
 ; CHECK-NEXT:    sltu a4, a3, a0
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vle8.v v0, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vvm v24, v16, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; CHECK-NEXT:    bltu a3, a2, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB11_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -221,38 +211,39 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vle8.v v16, (a1)
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v8
+; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
-; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -458,8 +449,8 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
index 7dcd4c41998279..a1596e2080b361 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
@@ -423,8 +423,8 @@ define void @vselect_vv_v8i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vv_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -440,8 +440,8 @@ define void @vselect_vx_v8i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vx_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vse32.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -458,8 +458,8 @@ define void @vselect_vi_v8i32(ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vi_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -474,8 +474,8 @@ define void @vselect_vv_v8f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vv_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -491,8 +491,8 @@ define void @vselect_vx_v8f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vx_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa0, v0
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -509,8 +509,8 @@ define void @vselect_vfpzero_v8f32(ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vfpzero_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -525,8 +525,8 @@ define void @vselect_vv_v16i16(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vv_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -542,8 +542,8 @@ define void @vselect_vx_v16i16(i16 signext %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vx_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -560,8 +560,8 @@ define void @vselect_vi_v16i16(ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vi_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, 4, v0
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -577,8 +577,8 @@ define void @vselect_vv_v32f16(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    vsetvli zero, a4, e16, m4, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -595,8 +595,8 @@ define void @vselect_vx_v32f16(half %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa0, v0
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -614,8 +614,8 @@ define void @vselect_vfpzero_v32f16(ptr %b, ptr %cc, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index 2caa2ff41a7d93..5df2db29f2ca53 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -384,7 +384,7 @@ declare <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>,
 define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssub_vi_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
@@ -400,7 +400,7 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 %evl)
@@ -437,11 +437,11 @@ define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vlm.v v25, (a0)
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 129)
@@ -455,11 +455,11 @@ define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vlm.v v25, (a0)
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index 6313f31bc1a615..2ae7a965ab5132 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -379,7 +379,7 @@ declare <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>,
 define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssubu_vi_v258i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
@@ -395,7 +395,7 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 %evl)
@@ -432,11 +432,11 @@ define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vlm.v v25, (a0)
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 129)
@@ -450,11 +450,11 @@ define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vlm.v v25, (a0)
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128)
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index 9c4706b2bda764..777e0055a2bb87 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -135,21 +135,21 @@ declare <vscale x 8 x half> @llvm.vp.floor.nxv8f16(<vscale x 8 x half>, <vscale
 define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x half> @llvm.vp.floor.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x half> %v
@@ -179,21 +179,21 @@ declare <vscale x 16 x half> @llvm.vp.floor.nxv16f16(<vscale x 16 x half>, <vsca
 define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI8_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.floor.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x half> %v
@@ -223,21 +223,21 @@ declare <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half>, <vsca
 define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI10_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -351,21 +351,21 @@ declare <vscale x 4 x float> @llvm.vp.floor.nxv4f32(<vscale x 4 x float>, <vscal
 define <vscale x 4 x float> @vp_floor_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.floor.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -395,21 +395,21 @@ declare <vscale x 8 x float> @llvm.vp.floor.nxv8f32(<vscale x 8 x float>, <vscal
 define <vscale x 8 x float> @vp_floor_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.floor.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -439,21 +439,21 @@ declare <vscale x 16 x float> @llvm.vp.floor.nxv16f32(<vscale x 16 x float>, <vs
 define <vscale x 16 x float> @vp_floor_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.floor.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -525,21 +525,21 @@ declare <vscale x 2 x double> @llvm.vp.floor.nxv2f64(<vscale x 2 x double>, <vsc
 define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.floor.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -569,21 +569,21 @@ declare <vscale x 4 x double> @llvm.vp.floor.nxv4f64(<vscale x 4 x double>, <vsc
 define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.floor.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -613,21 +613,21 @@ declare <vscale x 7 x double> @llvm.vp.floor.nxv7f64(<vscale x 7 x double>, <vsc
 define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.floor.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -657,21 +657,21 @@ declare <vscale x 8 x double> @llvm.vp.floor.nxv8f64(<vscale x 8 x double>, <vsc
 define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.floor.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -702,19 +702,11 @@ declare <vscale x 16 x double> @llvm.vp.floor.nxv16f64(<vscale x 16 x double>, <
 define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -722,51 +714,36 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 2
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.floor.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index 386f23f68c357e..82d112335a27d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -23,11 +23,10 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv1f16_vv:
@@ -37,11 +36,10 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -56,11 +54,10 @@ define <vscale x 2 x half> @vfmax_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv2f16_vv:
@@ -70,11 +67,10 @@ define <vscale x 2 x half> @vfmax_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -89,11 +85,10 @@ define <vscale x 4 x half> @vfmax_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v11
+; ZVFH-NEXT:    vfmax.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv4f16_vv:
@@ -103,11 +98,10 @@ define <vscale x 4 x half> @vfmax_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
+; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -122,11 +116,10 @@ define <vscale x 8 x half> @vfmax_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v12
+; ZVFH-NEXT:    vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v14
+; ZVFH-NEXT:    vfmax.vv v8, v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv8f16_vv:
@@ -136,11 +129,10 @@ define <vscale x 8 x half> @vfmax_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
+; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -155,40 +147,25 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
-; ZVFH-NEXT:    vmerge.vvm v20, v8, v12, v0
-; ZVFH-NEXT:    vmv1r.v v0, v16
+; ZVFH-NEXT:    vmerge.vvm v16, v8, v12, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v12, v8, v0
-; ZVFH-NEXT:    vfmax.vv v8, v8, v20
+; ZVFH-NEXT:    vfmax.vv v8, v8, v16
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv16f16_vv:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.maximum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
   ret <vscale x 16 x half> %v
@@ -201,9 +178,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v24
 ; ZVFH-NEXT:    ret
@@ -229,29 +205,28 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT:    vfmax.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -278,11 +253,10 @@ define <vscale x 1 x float> @vfmax_nxv1f32_vv(<vscale x 1 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v11
+; CHECK-NEXT:    vfmax.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x float> @llvm.maximum.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b)
   ret <vscale x 1 x float> %v
@@ -295,11 +269,10 @@ define <vscale x 2 x float> @vfmax_nxv2f32_vv(<vscale x 2 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v11
+; CHECK-NEXT:    vfmax.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x float> @llvm.maximum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
   ret <vscale x 2 x float> %v
@@ -312,11 +285,10 @@ define <vscale x 4 x float> @vfmax_nxv4f32_vv(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v14
+; CHECK-NEXT:    vfmax.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.maximum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
   ret <vscale x 4 x float> %v
@@ -329,11 +301,10 @@ define <vscale x 8 x float> @vfmax_nxv8f32_vv(<vscale x 8 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v20
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.maximum.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b)
   ret <vscale x 8 x float> %v
@@ -346,9 +317,8 @@ define <vscale x 16 x float> @vfmax_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    ret
@@ -363,11 +333,10 @@ define <vscale x 1 x double> @vfmax_nxv1f64_vv(<vscale x 1 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v11
+; CHECK-NEXT:    vfmax.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x double> @llvm.maximum.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b)
   ret <vscale x 1 x double> %v
@@ -380,11 +349,10 @@ define <vscale x 2 x double> @vfmax_nxv2f64_vv(<vscale x 2 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v14
+; CHECK-NEXT:    vfmax.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
   ret <vscale x 2 x double> %v
@@ -397,11 +365,10 @@ define <vscale x 4 x double> @vfmax_nxv4f64_vv(<vscale x 4 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v20
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.maximum.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b)
   ret <vscale x 4 x double> %v
@@ -414,9 +381,8 @@ define <vscale x 8 x double> @vfmax_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    ret
@@ -463,16 +429,15 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -504,12 +469,11 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index 48baa12aa2e59b..69ad7b4d1bb828 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -23,11 +23,10 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv1f16_vv:
@@ -37,11 +36,10 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -56,11 +54,10 @@ define <vscale x 2 x half> @vfmin_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv2f16_vv:
@@ -70,11 +67,10 @@ define <vscale x 2 x half> @vfmin_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -89,11 +85,10 @@ define <vscale x 4 x half> @vfmin_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
-; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v11
+; ZVFH-NEXT:    vfmin.vv v8, v8, v10
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv4f16_vv:
@@ -103,11 +98,10 @@ define <vscale x 4 x half> @vfmin_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v14
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
+; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -122,11 +116,10 @@ define <vscale x 8 x half> @vfmin_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
-; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v12
+; ZVFH-NEXT:    vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v14
+; ZVFH-NEXT:    vfmin.vv v8, v8, v12
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv8f16_vv:
@@ -136,11 +129,10 @@ define <vscale x 8 x half> @vfmin_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v20
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
+; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -155,40 +147,25 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
-; ZVFH-NEXT:    vmerge.vvm v20, v8, v12, v0
-; ZVFH-NEXT:    vmv1r.v v0, v16
+; ZVFH-NEXT:    vmerge.vvm v16, v8, v12, v0
+; ZVFH-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v12, v8, v0
-; ZVFH-NEXT:    vfmin.vv v8, v8, v20
+; ZVFH-NEXT:    vfmin.vv v8, v8, v16
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv16f16_vv:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.minimum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
   ret <vscale x 16 x half> %v
@@ -201,9 +178,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v24
 ; ZVFH-NEXT:    ret
@@ -229,29 +205,28 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT:    vfmin.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -278,11 +253,10 @@ define <vscale x 1 x float> @vfmin_nxv1f32_vv(<vscale x 1 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v11
+; CHECK-NEXT:    vfmin.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x float> @llvm.minimum.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b)
   ret <vscale x 1 x float> %v
@@ -295,11 +269,10 @@ define <vscale x 2 x float> @vfmin_nxv2f32_vv(<vscale x 2 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v11
+; CHECK-NEXT:    vfmin.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x float> @llvm.minimum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
   ret <vscale x 2 x float> %v
@@ -312,11 +285,10 @@ define <vscale x 4 x float> @vfmin_nxv4f32_vv(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v14
+; CHECK-NEXT:    vfmin.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.minimum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
   ret <vscale x 4 x float> %v
@@ -329,11 +301,10 @@ define <vscale x 8 x float> @vfmin_nxv8f32_vv(<vscale x 8 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v20
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.minimum.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b)
   ret <vscale x 8 x float> %v
@@ -346,9 +317,8 @@ define <vscale x 16 x float> @vfmin_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    ret
@@ -363,11 +333,10 @@ define <vscale x 1 x double> @vfmin_nxv1f64_vv(<vscale x 1 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v11
+; CHECK-NEXT:    vfmin.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x double> @llvm.minimum.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b)
   ret <vscale x 1 x double> %v
@@ -380,11 +349,10 @@ define <vscale x 2 x double> @vfmin_nxv2f64_vv(<vscale x 2 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
-; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v14
+; CHECK-NEXT:    vfmin.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
   ret <vscale x 2 x double> %v
@@ -397,11 +365,10 @@ define <vscale x 4 x double> @vfmin_nxv4f64_vv(<vscale x 4 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
-; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v20
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.minimum.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b)
   ret <vscale x 4 x double> %v
@@ -414,9 +381,8 @@ define <vscale x 8 x double> @vfmin_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    ret
@@ -463,16 +429,15 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -504,12 +469,11 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
index f90237b8d7e95d..4ea9657f9b17cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
@@ -10,9 +10,9 @@ define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -33,9 +33,9 @@ define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -56,9 +56,9 @@ define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -79,9 +79,9 @@ define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -102,9 +102,9 @@ define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %v) strictf
 ; CHECK-LABEL: nearbyint_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -125,9 +125,9 @@ define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %v) strictf
 ; CHECK-LABEL: nearbyint_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
@@ -263,9 +263,9 @@ define <vscale x 1 x double> @nearbyint_nxv1f64(<vscale x 1 x double> %v) strict
 ; CHECK-LABEL: nearbyint_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -286,9 +286,9 @@ define <vscale x 2 x double> @nearbyint_nxv2f64(<vscale x 2 x double> %v) strict
 ; CHECK-LABEL: nearbyint_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -309,9 +309,9 @@ define <vscale x 4 x double> @nearbyint_nxv4f64(<vscale x 4 x double> %v) strict
 ; CHECK-LABEL: nearbyint_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -332,9 +332,9 @@ define <vscale x 8 x double> @nearbyint_nxv8f64(<vscale x 8 x double> %v) strict
 ; CHECK-LABEL: nearbyint_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll
index 3276f481f30ea5..b3e5d46c201a7d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll
@@ -10,9 +10,9 @@ define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -32,9 +32,9 @@ define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -54,9 +54,9 @@ define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -76,9 +76,9 @@ define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -98,9 +98,9 @@ define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -120,9 +120,9 @@ define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
@@ -252,9 +252,9 @@ define <vscale x 1 x double> @round_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -274,9 +274,9 @@ define <vscale x 2 x double> @round_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -296,9 +296,9 @@ define <vscale x 4 x double> @round_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -318,9 +318,9 @@ define <vscale x 8 x double> @round_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
index 4ebfcccbaaa6e6..a3bb9cb0ec7d50 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
@@ -10,9 +10,9 @@ define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -32,9 +32,9 @@ define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -54,9 +54,9 @@ define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -76,9 +76,9 @@ define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -98,9 +98,9 @@ define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) strictf
 ; CHECK-LABEL: roundeven_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -120,9 +120,9 @@ define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) strictf
 ; CHECK-LABEL: roundeven_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
@@ -252,9 +252,9 @@ define <vscale x 1 x double> @roundeven_nxv1f64(<vscale x 1 x double> %x) strict
 ; CHECK-LABEL: roundeven_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -274,9 +274,9 @@ define <vscale x 2 x double> @roundeven_nxv2f64(<vscale x 2 x double> %x) strict
 ; CHECK-LABEL: roundeven_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -296,9 +296,9 @@ define <vscale x 4 x double> @roundeven_nxv4f64(<vscale x 4 x double> %x) strict
 ; CHECK-LABEL: roundeven_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -318,9 +318,9 @@ define <vscale x 8 x double> @roundeven_nxv8f64(<vscale x 8 x double> %x) strict
 ; CHECK-LABEL: roundeven_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index f9f085dcc16143..816b63edc87a85 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -251,10 +251,10 @@ define <vscale x 64 x i8> @fshl_v64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8>
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vnot.v v8, v24, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 7, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; CHECK-NEXT:    vnot.v v16, v24, v0.t
+; CHECK-NEXT:    vand.vi v16, v16, 7, v0.t
+; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vi v16, v24, 7, v0.t
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vv v16, v24, v16, v0.t
@@ -483,10 +483,10 @@ define <vscale x 32 x i16> @fshl_v32i16(<vscale x 32 x i16> %a, <vscale x 32 x i
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vnot.v v8, v24, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; CHECK-NEXT:    vnot.v v16, v24, v0.t
+; CHECK-NEXT:    vand.vi v16, v16, 15, v0.t
+; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vi v16, v24, 15, v0.t
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vv v16, v24, v16, v0.t
@@ -659,14 +659,15 @@ define <vscale x 16 x i32> @fshr_v16i32(<vscale x 16 x i32> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v8, v24, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -684,26 +685,37 @@ define <vscale x 16 x i32> @fshl_v16i32(<vscale x 16 x i32> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v16, v24, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsll.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -834,14 +846,15 @@ define <vscale x 7 x i64> @fshr_v7i64(<vscale x 7 x i64> %a, <vscale x 7 x i64>
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v8, v24, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -859,26 +872,37 @@ define <vscale x 7 x i64> @fshl_v7i64(<vscale x 7 x i64> %a, <vscale x 7 x i64>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v16, v24, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsll.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -901,14 +925,15 @@ define <vscale x 8 x i64> @fshr_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64>
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v8, v24, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -926,399 +951,52 @@ define <vscale x 8 x i64> @fshl_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v16, v8
-; CHECK-NEXT:    li a0, 63
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
-; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vnot.v v16, v24, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.vp.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, <vscale x 8 x i64> %c, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x i64> %res
-}
-
-declare <vscale x 16 x i64> @llvm.vp.fshr.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
-define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i64> %b, <vscale x 16 x i64> %c, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fshr_v16i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 56
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a1, a3, 3
-; CHECK-NEXT:    add a5, a0, a1
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    srli a5, a3, 3
-; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a5
-; CHECK-NEXT:    add a5, a2, a1
-; CHECK-NEXT:    sub a1, a4, a3
-; CHECK-NEXT:    sltu a6, a4, a1
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a6, a6, a1
-; CHECK-NEXT:    li a1, 63
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a5, a5, a7
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 48
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
+; CHECK-NEXT:    vsll.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vnot.v v24, v8, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a4, a3, .LBB46_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 48
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 48
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v16, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 56
-; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i64> @llvm.vp.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, <vscale x 8 x i64> %c, <vscale x 8 x i1> %m, i32 %evl)
+  ret <vscale x 8 x i64> %res
+}
+
+declare <vscale x 16 x i64> @llvm.vp.fshr.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i64> %b, <vscale x 16 x i64> %c, <vscale x 16 x i1> %m, i32 zeroext %evl) {
   %res = call <vscale x 16 x i64> @llvm.vp.fshr.nxv16i64(<vscale x 16 x i64> %a, <vscale x 16 x i64> %b, <vscale x 16 x i64> %c, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %res
 }
 
 declare <vscale x 16 x i64> @llvm.vp.fshl.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
 define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i64> %b, <vscale x 16 x i64> %c, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fshl_v16i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 56
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a1, a3, 3
-; CHECK-NEXT:    add a5, a0, a1
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    srli a5, a3, 3
-; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a5
-; CHECK-NEXT:    add a5, a2, a1
-; CHECK-NEXT:    sub a1, a4, a3
-; CHECK-NEXT:    sltu a6, a4, a1
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a6, a6, a1
-; CHECK-NEXT:    li a1, 63
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a5, a5, a7
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a0, a0, a5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 48
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a4, a3, .LBB47_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:  .LBB47_2:
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 48
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 48
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v16, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 56
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i64> @llvm.vp.fshl.nxv16i64(<vscale x 16 x i64> %a, <vscale x 16 x i64> %b, <vscale x 16 x i64> %c, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %res
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
index 3665669d83a3d4..71628f029c5215 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
@@ -8,9 +8,9 @@ define <vscale x 1 x half> @trunc_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -28,9 +28,9 @@ define <vscale x 2 x half> @trunc_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -48,9 +48,9 @@ define <vscale x 4 x half> @trunc_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -68,9 +68,9 @@ define <vscale x 8 x half> @trunc_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -88,9 +88,9 @@ define <vscale x 16 x half> @trunc_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -108,9 +108,9 @@ define <vscale x 32 x half> @trunc_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
@@ -228,9 +228,9 @@ define <vscale x 1 x double> @trunc_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
@@ -248,9 +248,9 @@ define <vscale x 2 x double> @trunc_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
@@ -268,9 +268,9 @@ define <vscale x 4 x double> @trunc_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -288,9 +288,9 @@ define <vscale x 8 x double> @trunc_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll b/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll
index 292f1deb2cce8d..9475989d463430 100644
--- a/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll
@@ -11,8 +11,8 @@ define <vscale x 8 x i64> @vpload_nxv8i64(ptr %ptr, <vscale x 8 x i1> %m, i32 ze
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE64_V_M8_MASK:%[0-9]+]]:vrm8nov0 = PseudoVLE64_V_M8_MASK $noreg, [[COPY2]], $v0, [[COPY]], 6 /* e64 */, 1 /* ta, mu */ :: (load unknown-size from %ir.ptr, align 64)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVLE64_V_M8_MASK:%[0-9]+]]:vrm8nov0 = PseudoVLE64_V_M8_MASK $noreg, [[COPY2]], [[COPY3]], [[COPY]], 6 /* e64 */, 1 /* ta, mu */ :: (load unknown-size from %ir.ptr, align 64)
   ; CHECK-NEXT:   $v8m8 = COPY [[PseudoVLE64_V_M8_MASK]]
   ; CHECK-NEXT:   PseudoRET implicit $v8m8
   %load = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index b15896580d4253..2a6f3c257b8a5d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -416,17 +416,17 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_0(<vscale x 4 x i1> %v, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
+; CHECK-NEXT:    vmv.v.v v10, v9
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    ret
   %vec = call <vscale x 4 x i1> @llvm.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1> %v, <vscale x 1 x i1> %sv, i64 0)
   ret <vscale x 4 x i1> %vec
@@ -437,19 +437,19 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_2(<vscale x 4 x i1> %v, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
+; CHECK-NEXT:    vslideup.vx v10, v9, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    ret
   %vec = call <vscale x 4 x i1> @llvm.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1> %v, <vscale x 1 x i1> %sv, i64 2)
   ret <vscale x 4 x i1> %vec
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index 6d8763d34ec1b0..7e8553c480414e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -55,10 +55,10 @@ declare <vscale x 8 x i64> @llvm.vp.llrint.nxv8i64.nxv8f32(<vscale x 8 x float>,
 define <vscale x 16 x i64> @llrint_nxv16i64_nxv16f32(<vscale x 16 x float> %x, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: llrint_nxv16i64_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index 9fa8807ed4add9..a4f19b36a6c088 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -117,10 +117,10 @@ define <vscale x 16 x iXLen> @lrint_nxv16f32(<vscale x 16 x float> %x, <vscale x
 ;
 ; RV64-i64-LABEL: lrint_nxv16f32:
 ; RV64-i64:       # %bb.0:
-; RV64-i64-NEXT:    vmv1r.v v24, v0
 ; RV64-i64-NEXT:    csrr a1, vlenb
 ; RV64-i64-NEXT:    srli a2, a1, 3
 ; RV64-i64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-i64-NEXT:    vmv1r.v v24, v0
 ; RV64-i64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-i64-NEXT:    sub a2, a0, a1
 ; RV64-i64-NEXT:    sltu a3, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index e12f1cf7603b84..b71efc8c615044 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -2107,24 +2107,25 @@ define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8>
 ;
 ; RV64-LABEL: mgather_baseidx_nxv32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vmv1r.v v16, v0
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v24, v8
+; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT:    vluxei64.v v12, (a0), v24, v0.t
+; RV64-NEXT:    vluxei64.v v12, (a0), v16, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a2, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a2
+; RV64-NEXT:    vslidedown.vx v8, v0, a2
+; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v24, v9
+; RV64-NEXT:    vsext.vf8 v16, v9
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT:    vluxei64.v v13, (a0), v24, v0.t
+; RV64-NEXT:    vmv1r.v v0, v8
+; RV64-NEXT:    vluxei64.v v13, (a0), v16, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v16, a1
+; RV64-NEXT:    vslidedown.vx v9, v24, a1
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v8, a2
+; RV64-NEXT:    vslidedown.vx v0, v9, a2
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v11
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
@@ -2132,7 +2133,7 @@ define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8>
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v10
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v8
+; RV64-NEXT:    vmv1r.v v0, v9
 ; RV64-NEXT:    vluxei64.v v14, (a0), v16, v0.t
 ; RV64-NEXT:    vmv4r.v v8, v12
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll b/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
index da56e235ecd8ed..af30f7d7e6b4a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
@@ -11,16 +11,16 @@ define dso_local void @test(ptr nocapture noundef %var_99) {
 ; CHECK-NEXT:    addi a1, a1, %lo(.L__const.test.var_45)
 ; CHECK-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    li a1, 1
-; CHECK-NEXT:    vmul.vx v12, v8, a1
 ; CHECK-NEXT:    lui a1, %hi(.L__const.test.var_101)
 ; CHECK-NEXT:    addi a1, a1, %lo(.L__const.test.var_101)
-; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    vmv.x.s a1, v12
+; CHECK-NEXT:    vle8.v v12, (a1)
+; CHECK-NEXT:    li a1, 1
+; CHECK-NEXT:    vmul.vx v16, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v16
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vssra.vv v12, v12, v8
 ; CHECK-NEXT:    vmsleu.vx v0, v8, a1
-; CHECK-NEXT:    vssra.vv v8, v16, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v12, v0
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 8bc2334282653f..fadb4a156fb556 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -194,23 +194,23 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <vscale x 8 x half> @llvm.vp.nearbyint.nxv8f16(<vscale x 8 x half>, <vsc
 define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -347,42 +347,42 @@ declare <vscale x 16 x half> @llvm.vp.nearbyint.nxv16f16(<vscale x 16 x half>, <
 define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI8_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v17, v0
+; ZVFH-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v12
-; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v17
+; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v25, v0
+; ZVFHMIN-NEXT:    vmflt.vf v25, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v25
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -433,32 +433,25 @@ declare <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half>, <
 define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v25, v0
+; ZVFH-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v16
-; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; ZVFH-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -467,54 +460,48 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    fsflags a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v9, v7
+; ZVFHMIN-NEXT:    vmflt.vf v9, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -538,12 +525,6 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -554,35 +535,31 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    fsflags a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB11_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB11_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -594,10 +571,6 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -692,21 +665,21 @@ declare <vscale x 4 x float> @llvm.vp.nearbyint.nxv4f32(<vscale x 4 x float>, <v
 define <vscale x 4 x float> @vp_nearbyint_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.nearbyint.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -736,21 +709,21 @@ declare <vscale x 8 x float> @llvm.vp.nearbyint.nxv8f32(<vscale x 8 x float>, <v
 define <vscale x 8 x float> @vp_nearbyint_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.nearbyint.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -780,21 +753,21 @@ declare <vscale x 16 x float> @llvm.vp.nearbyint.nxv16f32(<vscale x 16 x float>,
 define <vscale x 16 x float> @vp_nearbyint_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.nearbyint.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -866,21 +839,21 @@ declare <vscale x 2 x double> @llvm.vp.nearbyint.nxv2f64(<vscale x 2 x double>,
 define <vscale x 2 x double> @vp_nearbyint_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.nearbyint.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -910,21 +883,21 @@ declare <vscale x 4 x double> @llvm.vp.nearbyint.nxv4f64(<vscale x 4 x double>,
 define <vscale x 4 x double> @vp_nearbyint_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.nearbyint.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -954,21 +927,21 @@ declare <vscale x 7 x double> @llvm.vp.nearbyint.nxv7f64(<vscale x 7 x double>,
 define <vscale x 7 x double> @vp_nearbyint_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.nearbyint.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -998,21 +971,21 @@ declare <vscale x 8 x double> @llvm.vp.nearbyint.nxv8f64(<vscale x 8 x double>,
 define <vscale x 8 x double> @vp_nearbyint_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.nearbyint.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -1043,19 +1016,11 @@ declare <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double
 define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1063,62 +1028,36 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfabs.v v16, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    frflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll
index 1ad8e2d66392a4..c99c8ad8dff482 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll
@@ -14,8 +14,8 @@ define <vscale x 1 x double> @foo(<vscale x 1 x double> %x, <vscale x 1 x double
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vr = COPY $v8
   ; CHECK-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32
   ; CHECK-NEXT:   [[SRLI:%[0-9]+]]:gprnox0 = SRLI killed [[SLLI]], 32
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVFMUL_VV_M1_MASK:%[0-9]+]]:vrnov0 = nnan ninf nsz arcp contract afn reassoc nofpexcept PseudoVFMUL_VV_M1_MASK $noreg, [[COPY3]], [[COPY2]], $v0, 7, killed [[SRLI]], 6 /* e64 */, 1 /* ta, mu */, implicit $frm
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVFMUL_VV_M1_MASK:%[0-9]+]]:vrnov0 = nnan ninf nsz arcp contract afn reassoc nofpexcept PseudoVFMUL_VV_M1_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], 7, killed [[SRLI]], 6 /* e64 */, 1 /* ta, mu */, implicit $frm
   ; CHECK-NEXT:   $v8 = COPY [[PseudoVFMUL_VV_M1_MASK]]
   ; CHECK-NEXT:   PseudoRET implicit $v8
   %1 = call fast <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %x, <vscale x 1 x double> %y, <vscale x 1 x i1> %m, i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index f934127f978dc9..796e6ddd007023 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -176,21 +176,21 @@ define <vscale x 4 x half> @vp_rint_nxv4f16(<vscale x 4 x half> %va, <vscale x 4
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -237,38 +237,38 @@ declare <vscale x 8 x half> @llvm.vp.rint.nxv8f16(<vscale x 8 x half>, <vscale x
 define <vscale x 8 x half> @vp_rint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_rint_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -315,38 +315,38 @@ declare <vscale x 16 x half> @llvm.vp.rint.nxv16f16(<vscale x 16 x half>, <vscal
 define <vscale x 16 x half> @vp_rint_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_rint_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI8_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v17, v0
+; ZVFH-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v12
-; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v17
+; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v25, v0
+; ZVFHMIN-NEXT:    vmflt.vf v25, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v25
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -393,30 +393,23 @@ declare <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half>, <vscal
 define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_rint_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v25, v0
+; ZVFH-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v16
-; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; ZVFH-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -425,51 +418,44 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v9, v7
+; ZVFHMIN-NEXT:    vmflt.vf v9, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -491,12 +477,6 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -507,33 +487,29 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB11_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB11_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -543,10 +519,6 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -633,19 +605,19 @@ declare <vscale x 4 x float> @llvm.vp.rint.nxv4f32(<vscale x 4 x float>, <vscale
 define <vscale x 4 x float> @vp_rint_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.rint.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -673,19 +645,19 @@ declare <vscale x 8 x float> @llvm.vp.rint.nxv8f32(<vscale x 8 x float>, <vscale
 define <vscale x 8 x float> @vp_rint_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.rint.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -713,19 +685,19 @@ declare <vscale x 16 x float> @llvm.vp.rint.nxv16f32(<vscale x 16 x float>, <vsc
 define <vscale x 16 x float> @vp_rint_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.rint.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -791,19 +763,19 @@ declare <vscale x 2 x double> @llvm.vp.rint.nxv2f64(<vscale x 2 x double>, <vsca
 define <vscale x 2 x double> @vp_rint_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.rint.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -831,19 +803,19 @@ declare <vscale x 4 x double> @llvm.vp.rint.nxv4f64(<vscale x 4 x double>, <vsca
 define <vscale x 4 x double> @vp_rint_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.rint.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -871,19 +843,19 @@ declare <vscale x 7 x double> @llvm.vp.rint.nxv7f64(<vscale x 7 x double>, <vsca
 define <vscale x 7 x double> @vp_rint_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.rint.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -911,19 +883,19 @@ declare <vscale x 8 x double> @llvm.vp.rint.nxv8f64(<vscale x 8 x double>, <vsca
 define <vscale x 8 x double> @vp_rint_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.rint.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -952,19 +924,11 @@ declare <vscale x 16 x double> @llvm.vp.rint.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -972,47 +936,32 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.rint.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index eb4994914fad9a..43fd54cd63df61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -194,23 +194,23 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <vscale x 8 x half> @llvm.vp.round.nxv8f16(<vscale x 8 x half>, <vscale
 define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -347,42 +347,42 @@ declare <vscale x 16 x half> @llvm.vp.round.nxv16f16(<vscale x 16 x half>, <vsca
 define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI8_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v17, v0
+; ZVFH-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
-; ZVFH-NEXT:    vmv1r.v v0, v12
-; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v17
+; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v25, v0
+; ZVFHMIN-NEXT:    vmflt.vf v25, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v25
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -433,32 +433,25 @@ declare <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half>, <vsca
 define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v25, v0
+; ZVFH-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
-; ZVFH-NEXT:    vmv1r.v v0, v16
-; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -467,55 +460,48 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v9, v7
+; ZVFHMIN-NEXT:    vmflt.vf v9, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -539,12 +525,6 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -555,35 +535,31 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB11_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB11_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -595,10 +571,6 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -693,21 +665,21 @@ declare <vscale x 4 x float> @llvm.vp.round.nxv4f32(<vscale x 4 x float>, <vscal
 define <vscale x 4 x float> @vp_round_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.round.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -737,21 +709,21 @@ declare <vscale x 8 x float> @llvm.vp.round.nxv8f32(<vscale x 8 x float>, <vscal
 define <vscale x 8 x float> @vp_round_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.round.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -781,21 +753,21 @@ declare <vscale x 16 x float> @llvm.vp.round.nxv16f32(<vscale x 16 x float>, <vs
 define <vscale x 16 x float> @vp_round_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.round.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -867,21 +839,21 @@ declare <vscale x 2 x double> @llvm.vp.round.nxv2f64(<vscale x 2 x double>, <vsc
 define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.round.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -911,21 +883,21 @@ declare <vscale x 4 x double> @llvm.vp.round.nxv4f64(<vscale x 4 x double>, <vsc
 define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.round.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -955,21 +927,21 @@ declare <vscale x 7 x double> @llvm.vp.round.nxv7f64(<vscale x 7 x double>, <vsc
 define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.round.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -999,21 +971,21 @@ declare <vscale x 8 x double> @llvm.vp.round.nxv8f64(<vscale x 8 x double>, <vsc
 define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.round.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -1044,19 +1016,11 @@ declare <vscale x 16 x double> @llvm.vp.round.nxv16f64(<vscale x 16 x double>, <
 define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1064,51 +1028,36 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 4
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.round.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index f366a2922d079e..8214159e674e10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -194,23 +194,23 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <vscale x 8 x half> @llvm.vp.roundeven.nxv8f16(<vscale x 8 x half>, <vsc
 define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -347,42 +347,42 @@ declare <vscale x 16 x half> @llvm.vp.roundeven.nxv16f16(<vscale x 16 x half>, <
 define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI8_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v17, v0
+; ZVFH-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
-; ZVFH-NEXT:    vmv1r.v v0, v12
-; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v17
+; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v25, v0
+; ZVFHMIN-NEXT:    vmflt.vf v25, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v25
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -433,32 +433,25 @@ declare <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half>, <
 define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v25, v0
+; ZVFH-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
-; ZVFH-NEXT:    vmv1r.v v0, v16
-; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -467,55 +460,48 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v9, v7
+; ZVFHMIN-NEXT:    vmflt.vf v9, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -539,12 +525,6 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -555,35 +535,31 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB11_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB11_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -595,10 +571,6 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -693,21 +665,21 @@ declare <vscale x 4 x float> @llvm.vp.roundeven.nxv4f32(<vscale x 4 x float>, <v
 define <vscale x 4 x float> @vp_roundeven_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.roundeven.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -737,21 +709,21 @@ declare <vscale x 8 x float> @llvm.vp.roundeven.nxv8f32(<vscale x 8 x float>, <v
 define <vscale x 8 x float> @vp_roundeven_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.roundeven.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -781,21 +753,21 @@ declare <vscale x 16 x float> @llvm.vp.roundeven.nxv16f32(<vscale x 16 x float>,
 define <vscale x 16 x float> @vp_roundeven_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.roundeven.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -867,21 +839,21 @@ declare <vscale x 2 x double> @llvm.vp.roundeven.nxv2f64(<vscale x 2 x double>,
 define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.roundeven.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -911,21 +883,21 @@ declare <vscale x 4 x double> @llvm.vp.roundeven.nxv4f64(<vscale x 4 x double>,
 define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.roundeven.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -955,21 +927,21 @@ declare <vscale x 7 x double> @llvm.vp.roundeven.nxv7f64(<vscale x 7 x double>,
 define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.roundeven.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -999,21 +971,21 @@ declare <vscale x 8 x double> @llvm.vp.roundeven.nxv8f64(<vscale x 8 x double>,
 define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.roundeven.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -1044,19 +1016,11 @@ declare <vscale x 16 x double> @llvm.vp.roundeven.nxv16f64(<vscale x 16 x double
 define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1064,51 +1028,36 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 0
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.roundeven.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 79c940bdf089a5..e8ee307ac2e4ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -194,23 +194,23 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v9
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -261,42 +261,42 @@ declare <vscale x 8 x half> @llvm.vp.roundtozero.nxv8f16(<vscale x 8 x half>, <v
 define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v10, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v13, v0
+; ZVFH-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
-; ZVFH-NEXT:    vmv1r.v v0, v10
-; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v13
+; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v17, v0
+; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v10
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -347,42 +347,42 @@ declare <vscale x 16 x half> @llvm.vp.roundtozero.nxv16f16(<vscale x 16 x half>,
 define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI8_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v17, v0
+; ZVFH-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
-; ZVFH-NEXT:    vmv1r.v v0, v12
-; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v17
+; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v25, v0
+; ZVFHMIN-NEXT:    vmflt.vf v25, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v25
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -433,32 +433,25 @@ declare <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half>,
 define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; ZVFH-NEXT:    vmv1r.v v25, v0
+; ZVFH-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
-; ZVFH-NEXT:    vmv1r.v v0, v16
-; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
-; ZVFH-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
-; ZVFH-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -467,55 +460,48 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v9, v7
+; ZVFHMIN-NEXT:    vmflt.vf v9, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -539,12 +525,6 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -555,35 +535,31 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    lui a2, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v13, v0
+; ZVFHMIN-NEXT:    vmflt.vf v13, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB11_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB11_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -595,10 +571,6 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -693,21 +665,21 @@ declare <vscale x 4 x float> @llvm.vp.roundtozero.nxv4f32(<vscale x 4 x float>,
 define <vscale x 4 x float> @vp_roundtozero_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.roundtozero.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -737,21 +709,21 @@ declare <vscale x 8 x float> @llvm.vp.roundtozero.nxv8f32(<vscale x 8 x float>,
 define <vscale x 8 x float> @vp_roundtozero_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.roundtozero.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -781,21 +753,21 @@ declare <vscale x 16 x float> @llvm.vp.roundtozero.nxv16f32(<vscale x 16 x float
 define <vscale x 16 x float> @vp_roundtozero_nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.roundtozero.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -867,21 +839,21 @@ declare <vscale x 2 x double> @llvm.vp.roundtozero.nxv2f64(<vscale x 2 x double>
 define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    vfabs.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v10, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.roundtozero.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -911,21 +883,21 @@ declare <vscale x 4 x double> @llvm.vp.roundtozero.nxv4f64(<vscale x 4 x double>
 define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v12, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.roundtozero.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -955,21 +927,21 @@ declare <vscale x 7 x double> @llvm.vp.roundtozero.nxv7f64(<vscale x 7 x double>
 define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv7f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI28_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI28_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.roundtozero.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -999,21 +971,21 @@ declare <vscale x 8 x double> @llvm.vp.roundtozero.nxv8f64(<vscale x 8 x double>
 define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI30_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.roundtozero.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -1044,19 +1016,11 @@ declare <vscale x 16 x double> @llvm.vp.roundtozero.nxv16f64(<vscale x 16 x doub
 define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v0, a2
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1064,51 +1028,36 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v5, v0
+; CHECK-NEXT:    vmflt.vf v5, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 1
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v6, v7
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.roundtozero.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index 442419efb83caf..ec1e306797a8b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -15,7 +15,7 @@ body: |
     ; CHECK-NEXT: %avl:gprnox0 = COPY $x1
     ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 5 /* e32 */
     ; CHECK-NEXT: $v0 = COPY %mask
-    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %false, %true, %avl, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, $v0, %avl, 5 /* e32 */
     %false:vr = COPY $v8
     %true:vr = COPY $v9
     %avl:gprnox0 = COPY $x1
@@ -60,7 +60,7 @@ body: |
     ; CHECK-NEXT: %avl:gprnox0 = COPY $x1
     ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 5 /* e32 */
     ; CHECK-NEXT: $v0 = COPY %mask
-    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %false, %true, %avl, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, %avl, 5 /* e32 */
     %false:vr = COPY $v8
     %pt:vrnov0 = COPY $v8
     %true:vr = COPY $v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
index 31fd5bdbd31fd7..9ddd4f80a97cd5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
@@ -15,8 +15,8 @@ define void @vpmerge_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 8)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 8)
   ; CHECK-NEXT:   VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
   ; CHECK-NEXT:   PseudoRET
   %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
@@ -34,8 +34,8 @@ define void @vpselect_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size from %ir.p, align 8)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size from %ir.p, align 8)
   ; CHECK-NEXT:   VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
   ; CHECK-NEXT:   PseudoRET
   %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 897bfdea69f1f4..cddd371a25948a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -2194,14 +2194,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    addi sp, sp, -16
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 4
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT:    vmv1r.v v24, v0
-; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    add a1, sp, a1
-; ZVFH-NEXT:    addi a1, a1, 16
+; ZVFH-NEXT:    sub sp, sp, a1
+; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
@@ -2214,10 +2210,9 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    addi a5, a5, -1
 ; ZVFH-NEXT:    and a4, a5, a4
 ; ZVFH-NEXT:    srli a1, a1, 1
-; ZVFH-NEXT:    vl8re16.v v0, (a0)
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vslidedown.vx v0, v24, a1
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vmv1r.v v6, v0
+; ZVFH-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFH-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v7, v16, v8, v0.t
 ; ZVFH-NEXT:    bltu a2, a3, .LBB85_2
@@ -2225,21 +2220,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    mv a2, a3
 ; ZVFH-NEXT:  .LBB85_2:
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v24
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vmv1r.v v0, v6
 ; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vmfeq.vv v16, v8, v24, v0.t
 ; ZVFH-NEXT:    add a0, a1, a1
 ; ZVFH-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; ZVFH-NEXT:    vslideup.vx v16, v7, a1
 ; ZVFH-NEXT:    vmv.v.v v0, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add sp, sp, a0
 ; ZVFH-NEXT:    addi sp, sp, 16
 ; ZVFH-NEXT:    ret
@@ -2249,14 +2239,13 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 34
+; ZVFHMIN-NEXT:    li a3, 26
 ; ZVFHMIN-NEXT:    mul a1, a1, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 26 * vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 18
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    slli a3, a1, 3
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2264,7 +2253,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; ZVFHMIN-NEXT:    slli a1, a3, 3
 ; ZVFHMIN-NEXT:    add a1, a0, a1
-; ZVFHMIN-NEXT:    vl8re16.v v16, (a1)
+; ZVFHMIN-NEXT:    vl8re16.v v24, (a1)
 ; ZVFHMIN-NEXT:    slli a5, a3, 2
 ; ZVFHMIN-NEXT:    sub a1, a2, a5
 ; ZVFHMIN-NEXT:    sltu a4, a2, a1
@@ -2277,6 +2266,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    and a7, a7, a1
 ; ZVFHMIN-NEXT:    srli a1, a3, 1
 ; ZVFHMIN-NEXT:    csrr t0, vlenb
+; ZVFHMIN-NEXT:    li t1, 25
+; ZVFHMIN-NEXT:    mul t0, t0, t1
 ; ZVFHMIN-NEXT:    add t0, sp, t0
 ; ZVFHMIN-NEXT:    addi t0, t0, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (t0) # Unknown-size Folded Spill
@@ -2284,125 +2275,114 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li t0, 26
-; ZVFHMIN-NEXT:    mul a0, a0, t0
+; ZVFHMIN-NEXT:    slli t0, a0, 4
+; ZVFHMIN-NEXT:    add a0, t0, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v7, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v6, v7
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li t0, 10
-; ZVFHMIN-NEXT:    mul a0, a0, t0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v26, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB85_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a6, a4
 ; ZVFHMIN-NEXT:  .LBB85_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v20, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a7, 10
-; ZVFHMIN-NEXT:    mul a0, a0, a7
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v6
+; ZVFHMIN-NEXT:    vmfeq.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB85_4
 ; ZVFHMIN-NEXT:  # %bb.3:
 ; ZVFHMIN-NEXT:    mv a2, a5
 ; ZVFHMIN-NEXT:  .LBB85_4:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v6, v26, a3
+; ZVFHMIN-NEXT:    vslideup.vx v16, v20, a3
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vs1r.v v16, (a5) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a5, a2, a4
 ; ZVFHMIN-NEXT:    sltu a6, a2, a5
 ; ZVFHMIN-NEXT:    addi a6, a6, -1
 ; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
+; ZVFHMIN-NEXT:    li a7, 25
+; ZVFHMIN-NEXT:    mul a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
 ; ZVFHMIN-NEXT:    vl1r.v v8, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmv1r.v v7, v8
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
+; ZVFHMIN-NEXT:    addi a6, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a6) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    li a7, 18
-; ZVFHMIN-NEXT:    mul a6, a6, a7
+; ZVFHMIN-NEXT:    slli a7, a6, 3
+; ZVFHMIN-NEXT:    add a6, a7, a6
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    li a7, 10
-; ZVFHMIN-NEXT:    mul a6, a6, a7
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    li a7, 26
-; ZVFHMIN-NEXT:    mul a6, a6, a7
+; ZVFHMIN-NEXT:    slli a7, a6, 4
+; ZVFHMIN-NEXT:    add a6, a7, a6
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vl8r.v v0, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    li a6, 10
-; ZVFHMIN-NEXT:    mul a5, a5, a6
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmfeq.vv v5, v16, v8, v0.t
+; ZVFHMIN-NEXT:    addi a5, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmfeq.vv v6, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    bltu a2, a4, .LBB85_6
 ; ZVFHMIN-NEXT:  # %bb.5:
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB85_6:
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    li a5, 26
-; ZVFHMIN-NEXT:    mul a4, a4, a5
+; ZVFHMIN-NEXT:    slli a5, a4, 4
+; ZVFHMIN-NEXT:    add a4, a5, a4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    li a4, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v5, a3
+; ZVFHMIN-NEXT:    vslideup.vx v8, v6, a3
 ; ZVFHMIN-NEXT:    add a0, a1, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v6, a1
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vslideup.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 34
+; ZVFHMIN-NEXT:    li a1, 26
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -3498,12 +3478,6 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 40
 ; CHECK-NEXT:    mul a1, a1, a3
@@ -3512,48 +3486,45 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul t2, a3, a1
+; CHECK-NEXT:    mul t0, a3, a1
 ; CHECK-NEXT:    slli t1, a3, 3
 ; CHECK-NEXT:    srli a4, a3, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v0, a4
+; CHECK-NEXT:    vslidedown.vx v6, v0, a4
 ; CHECK-NEXT:    srli a1, a3, 3
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    add a5, a2, t1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    slli t0, a3, 4
+; CHECK-NEXT:    slli t2, a3, 4
 ; CHECK-NEXT:    slli a5, a3, 1
+; CHECK-NEXT:    vmv1r.v v4, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    mv a7, a6
 ; CHECK-NEXT:    bltu a6, a5, .LBB171_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a7, a5
 ; CHECK-NEXT:  .LBB171_2:
-; CHECK-NEXT:    add t2, a2, t2
-; CHECK-NEXT:    add t1, a0, t1
 ; CHECK-NEXT:    add t0, a2, t0
-; CHECK-NEXT:    vl8re64.v v16, (a2)
+; CHECK-NEXT:    add t1, a0, t1
+; CHECK-NEXT:    vslidedown.vx v5, v6, a1
+; CHECK-NEXT:    add t2, a2, t2
+; CHECK-NEXT:    vl8re64.v v24, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sub a2, a7, a3
 ; CHECK-NEXT:    sltu t3, a7, a2
 ; CHECK-NEXT:    addi t3, t3, -1
 ; CHECK-NEXT:    and a2, t3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v7, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a7, a3, .LBB171_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a7, a3
 ; CHECK-NEXT:  .LBB171_4:
-; CHECK-NEXT:    vl8re64.v v8, (t2)
+; CHECK-NEXT:    vl8re64.v v8, (t0)
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 5
 ; CHECK-NEXT:    add a2, sp, a2
@@ -3561,16 +3532,14 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (t1)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li t1, 24
-; CHECK-NEXT:    mul a2, a2, t1
+; CHECK-NEXT:    li t0, 24
+; CHECK-NEXT:    mul a2, a2, t0
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v18, v7, a1
-; CHECK-NEXT:    vl8re64.v v8, (t0)
+; CHECK-NEXT:    vl8re64.v v8, (t2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -3578,7 +3547,6 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a2, 40
 ; CHECK-NEXT:    mul a0, a0, a2
@@ -3586,42 +3554,43 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v17, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v4
+; CHECK-NEXT:    vmfeq.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
 ; CHECK-NEXT:    sub a0, a6, a5
 ; CHECK-NEXT:    sltu a2, a6, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    vslideup.vx v17, v6, a1
+; CHECK-NEXT:    vslideup.vx v16, v7, a1
 ; CHECK-NEXT:    mv a2, a0
 ; CHECK-NEXT:    bltu a0, a3, .LBB171_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB171_6:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v16, v24, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v17, v8, v24, v0.t
 ; CHECK-NEXT:    add a2, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v17, v16, a4
+; CHECK-NEXT:    vslideup.vx v16, v17, a4
 ; CHECK-NEXT:    sub a2, a0, a3
 ; CHECK-NEXT:    sltu a0, a0, a2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v18
+; CHECK-NEXT:    vmv1r.v v0, v5
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
@@ -3633,13 +3602,13 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v16, v8, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v17, v8, v24, v0.t
 ; CHECK-NEXT:    slli a0, a1, 1
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v17, v16, a0
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vslideup.vx v16, v17, a0
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 48
 ; CHECK-NEXT:    mul a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 7fd77c050b2957..0dede98ec80672 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1092,7 +1092,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
@@ -1102,15 +1102,15 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a4, a0, a1
 ; CHECK-NEXT:    vl8r.v v8, (a4)
-; CHECK-NEXT:    vl8r.v v0, (a0)
+; CHECK-NEXT:    vl8r.v v24, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    sub a0, a3, a1
+; CHECK-NEXT:    sltu a4, a3, a0
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    sltu a2, a3, a0
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v7, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB96_2
@@ -1118,7 +1118,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB96_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -1141,7 +1141,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -1151,16 +1151,16 @@ define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b,
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB97_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB97_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    vmv1r.v v8, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %b, i8 0
   %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
@@ -1171,7 +1171,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b,
 define <vscale x 128 x i1> @icmp_eq_vx_swap_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_swap_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -1181,16 +1181,16 @@ define <vscale x 128 x i1> @icmp_eq_vx_swap_nxv128i8(<vscale x 128 x i8> %va, i8
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB98_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB98_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    vmv1r.v v8, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %b, i8 0
   %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
@@ -2239,14 +2239,10 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
@@ -2259,10 +2255,9 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v24, a1
+; CHECK-NEXT:    vl8re32.v v24, (a0)
+; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v7, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB189_2
@@ -2270,21 +2265,16 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB189_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v7, a1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -2295,10 +2285,10 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a2, a3, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    sub a4, a1, a3
@@ -2306,17 +2296,17 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a1, a3, .LBB190_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a3
 ; CHECK-NEXT:  .LBB190_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    add a0, a2, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v25, a2
+; CHECK-NEXT:    vslideup.vx v16, v24, a2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
@@ -2328,10 +2318,10 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_swap_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a2, a3, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    sub a4, a1, a3
@@ -2339,17 +2329,17 @@ define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT:    vmseq.vx v24, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a1, a3, .LBB191_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a3
 ; CHECK-NEXT:  .LBB191_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    add a0, a2, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v25, a2
+; CHECK-NEXT:    vslideup.vx v16, v24, a2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
index 032d32109933f3..7daa0369185800 100644
--- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
@@ -49,8 +49,8 @@ define <8 x i8> @v4i8_2(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrsub.vi v8, v11, 3
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -174,8 +174,8 @@ define <8 x i16> @v4i16_2(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrsub.vi v8, v11, 3
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -234,10 +234,10 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI15_0)
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle16.v v20, (a0)
-; CHECK-NEXT:    vmv2r.v v16, v10
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vmv2r.v v20, v10
 ; CHECK-NEXT:    vmv2r.v v12, v8
-; CHECK-NEXT:    vrgather.vv v8, v12, v20
+; CHECK-NEXT:    vrgather.vv v8, v12, v16
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vrsub.vi v12, v12, 15
 ; CHECK-NEXT:    lui a0, 16
@@ -245,7 +245,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v16, v12, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v20, v12, v0.t
 ; CHECK-NEXT:    ret
   %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i16> %v32i16
@@ -329,18 +329,18 @@ define <16 x i32> @v8i32_2(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: v8i32_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v16, v10
-; CHECK-NEXT:    vmv2r.v v12, v8
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v18, v14, 15
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrsub.vi v18, v10, 15
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vrsub.vi v12, v14, 7
+; CHECK-NEXT:    vrsub.vi v8, v10, 7
 ; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v12, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %v16i32 = shufflevector <8 x i32> %a, <8 x i32> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i32> %v16i32
@@ -492,8 +492,8 @@ define <8 x half> @v4f16_2(<4 x half> %a, <4 x half> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrsub.vi v8, v11, 3
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -638,18 +638,18 @@ define <16 x float> @v8f32_2(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: v8f32_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v16, v10
-; CHECK-NEXT:    vmv2r.v v12, v8
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v18, v14, 15
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrsub.vi v18, v10, 15
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vrsub.vi v12, v14, 7
+; CHECK-NEXT:    vrsub.vi v8, v10, 7
 ; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v12, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %v16f32 = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x float> %v16f32
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 9046c861c3367a..0ac316a430e9f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -4865,19 +4865,19 @@ declare <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32>, <4 x i32>, metadata, <4 x i1>, i
 define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: sink_splat_vp_icmp:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a3, 1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:  .LBB102_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmseq.vx v0, v10, a1, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmseq.vx v0, v9, a1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v9, (a0), v0.t
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    bne a0, a3, .LBB102_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -4906,19 +4906,19 @@ declare <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float>, <4 x float>, metadata, <4 x i1
 define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: sink_splat_vp_fcmp:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:  .LBB103_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmfeq.vf v0, v10, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vf v0, v9, fa0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v9, (a0), v0.t
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    bne a0, a2, .LBB103_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
index b7fe722958bfb8..76493d9514a8dc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
@@ -13,14 +13,13 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    vsll.vv v10, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v10, v9
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    slli a0, a0, 63
-; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT:    vmerge.vxm v10, v10, a0, v0
+; CHECK-NEXT:    vsll.vv v11, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; CHECK-NEXT:    ret
   %tmp = call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %tmp
@@ -33,13 +32,12 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 524288
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsll.vv v10, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v10, v9
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmerge.vxm v10, v10, a0, v0
+; CHECK-NEXT:    vsll.vv v11, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
@@ -52,13 +50,12 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 8
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsll.vv v10, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v10, v9
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmerge.vxm v10, v10, a0, v0
+; CHECK-NEXT:    vsll.vv v11, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; CHECK-NEXT:    ret
   %tmp = call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %tmp
@@ -70,14 +67,13 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsll.vv v10, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v10, v9
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT:    vmerge.vxm v10, v10, a0, v0
+; CHECK-NEXT:    vsll.vv v11, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; CHECK-NEXT:    ret
   %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %tmp
@@ -95,14 +91,13 @@ define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64>
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    vsll.vv v12, v8, v10
-; CHECK-NEXT:    vsra.vv v14, v12, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v14
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    slli a0, a0, 63
-; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    vmerge.vxm v12, v12, a0, v0
+; CHECK-NEXT:    vsll.vv v14, v8, v10
+; CHECK-NEXT:    vsra.vv v10, v14, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v14, v12, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 2 x i64> @llvm.sshl.sat.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
   ret <vscale x 2 x i64> %tmp
@@ -115,13 +110,12 @@ define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32>
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 524288
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsll.vv v12, v8, v10
-; CHECK-NEXT:    vsra.vv v14, v12, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v14
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmerge.vxm v12, v12, a0, v0
+; CHECK-NEXT:    vsll.vv v14, v8, v10
+; CHECK-NEXT:    vsra.vv v10, v14, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v14, v12, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
   ret <vscale x 4 x i32> %tmp
@@ -134,13 +128,12 @@ define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16>
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 8
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsll.vv v12, v8, v10
-; CHECK-NEXT:    vsra.vv v14, v12, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v14
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmerge.vxm v12, v12, a0, v0
+; CHECK-NEXT:    vsll.vv v14, v8, v10
+; CHECK-NEXT:    vsra.vv v10, v14, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v14, v12, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
   ret <vscale x 8 x i16> %tmp
@@ -152,14 +145,13 @@ define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8>
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsll.vv v12, v8, v10
-; CHECK-NEXT:    vsra.vv v14, v12, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v14
-; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    vmerge.vxm v12, v12, a0, v0
+; CHECK-NEXT:    vsll.vv v14, v8, v10
+; CHECK-NEXT:    vsra.vv v10, v14, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v14, v12, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
   ret <vscale x 16 x i8> %tmp
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
index a8934bb25571c9..081afcfab8dae6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
@@ -16,8 +16,8 @@ define <vscale x 1 x i8> @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride,
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
   ; CHECK-NEXT:   $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]]
   ; CHECK-NEXT:   PseudoRET implicit $v8
   %load = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, <vscale x 1 x i1> %m, i32 %evl)
@@ -36,8 +36,8 @@ define void @strided_vpstore_nxv1i8_i8(<vscale x 1 x i8> %val, ptr %ptr, i8 sign
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vr = COPY $v8
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   PseudoVSSE8_V_MF8_MASK [[COPY4]], [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */ :: (store unknown-size, align 1)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   PseudoVSSE8_V_MF8_MASK [[COPY4]], [[COPY3]], [[COPY2]], [[COPY5]], [[COPY]], 3 /* e8 */ :: (store unknown-size, align 1)
   ; CHECK-NEXT:   PseudoRET
   call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8(<vscale x 1 x i8> %val, ptr %ptr, i8 %stride, <vscale x 1 x i1> %m, i32 %evl)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 0e2105d5cba861..7e4d211df5ee10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -577,7 +577,6 @@ declare <vscale x 3 x double> @llvm.experimental.vp.strided.load.nxv3f64.p0.i32(
 define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vscale x 16 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-RV32-LABEL: strided_load_nxv16f64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV32-NEXT:    csrr a4, vlenb
 ; CHECK-RV32-NEXT:    sub a2, a3, a4
 ; CHECK-RV32-NEXT:    sltu a5, a3, a2
@@ -591,7 +590,8 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    add a5, a0, a5
 ; CHECK-RV32-NEXT:    srli a4, a4, 3
 ; CHECK-RV32-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vx v8, v9, a4
+; CHECK-RV32-NEXT:    vslidedown.vx v8, v0, a4
+; CHECK-RV32-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV32-NEXT:    vlse64.v v16, (a5), a1, v0.t
@@ -602,7 +602,6 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ;
 ; CHECK-RV64-LABEL: strided_load_nxv16f64:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    sub a3, a2, a4
 ; CHECK-RV64-NEXT:    sltu a5, a2, a3
@@ -616,7 +615,8 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    add a5, a0, a5
 ; CHECK-RV64-NEXT:    srli a4, a4, 3
 ; CHECK-RV64-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vx v8, v9, a4
+; CHECK-RV64-NEXT:    vslidedown.vx v8, v0, a4
+; CHECK-RV64-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV64-NEXT:    vlse64.v v16, (a5), a1, v0.t
@@ -681,7 +681,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    csrr a2, vlenb
 ; CHECK-RV32-NEXT:    slli a7, a2, 1
-; CHECK-RV32-NEXT:    vmv1r.v v8, v0
+; CHECK-RV32-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV32-NEXT:    mv a6, a3
 ; CHECK-RV32-NEXT:    bltu a3, a7, .LBB51_2
 ; CHECK-RV32-NEXT:  # %bb.1:
@@ -700,7 +700,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    add t1, a0, t1
 ; CHECK-RV32-NEXT:    srli t2, a2, 3
 ; CHECK-RV32-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, t2
+; CHECK-RV32-NEXT:    vslidedown.vx v0, v9, t2
 ; CHECK-RV32-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (t1), a1, v0.t
 ; CHECK-RV32-NEXT:    sub a7, a3, a7
@@ -715,11 +715,12 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    add a6, a0, a6
 ; CHECK-RV32-NEXT:    srli a2, a2, 2
 ; CHECK-RV32-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-RV32-NEXT:    vslidedown.vx v8, v9, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV32-NEXT:    vlse64.v v24, (a6), a1, v0.t
 ; CHECK-RV32-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vmv1r.v v0, v8
+; CHECK-RV32-NEXT:    vmv1r.v v0, v9
 ; CHECK-RV32-NEXT:    vlse64.v v8, (a0), a1, v0.t
 ; CHECK-RV32-NEXT:    vs1r.v v24, (a4)
 ; CHECK-RV32-NEXT:    ret
@@ -728,7 +729,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    slli a7, a4, 1
-; CHECK-RV64-NEXT:    vmv1r.v v8, v0
+; CHECK-RV64-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV64-NEXT:    mv a6, a2
 ; CHECK-RV64-NEXT:    bltu a2, a7, .LBB51_2
 ; CHECK-RV64-NEXT:  # %bb.1:
@@ -747,7 +748,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    add t1, a0, t1
 ; CHECK-RV64-NEXT:    srli t2, a4, 3
 ; CHECK-RV64-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, t2
+; CHECK-RV64-NEXT:    vslidedown.vx v0, v9, t2
 ; CHECK-RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (t1), a1, v0.t
 ; CHECK-RV64-NEXT:    sub a7, a2, a7
@@ -762,11 +763,12 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    add a6, a0, a6
 ; CHECK-RV64-NEXT:    srli a4, a4, 2
 ; CHECK-RV64-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, a4
+; CHECK-RV64-NEXT:    vslidedown.vx v8, v9, a4
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV64-NEXT:    vlse64.v v24, (a6), a1, v0.t
 ; CHECK-RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-RV64-NEXT:    vmv1r.v v0, v8
+; CHECK-RV64-NEXT:    vmv1r.v v0, v9
 ; CHECK-RV64-NEXT:    vlse64.v v8, (a0), a1, v0.t
 ; CHECK-RV64-NEXT:    vs1r.v v24, (a3)
 ; CHECK-RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 9378bb3d3ca61f..ae048c8c244caa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -547,7 +547,6 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a6, a4, 1
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    mv a5, a3
 ; CHECK-NEXT:    bltu a3, a6, .LBB43_2
 ; CHECK-NEXT:  # %bb.1:
@@ -558,17 +557,8 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a7, a4
 ; CHECK-NEXT:  .LBB43_4:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
-; CHECK-NEXT:    sub sp, sp, t0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
 ; CHECK-NEXT:    sub a0, a5, a4
 ; CHECK-NEXT:    sltu t0, a5, a0
@@ -578,7 +568,8 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    add a7, a1, a7
 ; CHECK-NEXT:    srli t0, a4, 3
 ; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, t0
+; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, t0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    sub a0, a3, a6
 ; CHECK-NEXT:    sltu a3, a3, a0
@@ -593,15 +584,9 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    srli a4, a4, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
+; CHECK-NEXT:    vslidedown.vx v0, v8, a4
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vsse64.v v24, (a1), a2, v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.experimental.vp.strided.store.nxv17f64.p0.i32(<vscale x 17 x double> %v, ptr %ptr, i32 %stride, <vscale x 17 x i1> %mask, i32 %evl)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll
index 68e0c0089d0c78..a5dd27149c1f27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll
@@ -7,10 +7,10 @@ define <vscale x 1 x i8> @umulo_nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %
 ; CHECK-LABEL: umulo_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i8>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %y)
   %b = extractvalue { <vscale x 1 x i8>, <vscale x 1 x i1> } %a, 0
@@ -25,10 +25,10 @@ define <vscale x 2 x i8> @umulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %
 ; CHECK-LABEL: umulo_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
   %b = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 0
@@ -43,10 +43,10 @@ define <vscale x 4 x i8> @umulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %
 ; CHECK-LABEL: umulo_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
   %b = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 0
@@ -61,10 +61,10 @@ define <vscale x 8 x i8> @umulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %
 ; CHECK-LABEL: umulo_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
   %b = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 0
@@ -79,10 +79,10 @@ define <vscale x 16 x i8> @umulo_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i
 ; CHECK-LABEL: umulo_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
   %b = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 0
@@ -97,10 +97,10 @@ define <vscale x 32 x i8> @umulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i
 ; CHECK-LABEL: umulo_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
   %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
@@ -115,10 +115,10 @@ define <vscale x 64 x i8> @umulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK-LABEL: umulo_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.umul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
   %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
@@ -133,10 +133,10 @@ define <vscale x 1 x i16> @umulo_nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i1
 ; CHECK-LABEL: umulo_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i16>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %y)
   %b = extractvalue { <vscale x 1 x i16>, <vscale x 1 x i1> } %a, 0
@@ -151,10 +151,10 @@ define <vscale x 2 x i16> @umulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1
 ; CHECK-LABEL: umulo_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
   %b = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 0
@@ -169,10 +169,10 @@ define <vscale x 4 x i16> @umulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i1
 ; CHECK-LABEL: umulo_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
   %b = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 0
@@ -187,10 +187,10 @@ define <vscale x 8 x i16> @umulo_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i1
 ; CHECK-LABEL: umulo_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
   %b = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 0
@@ -205,10 +205,10 @@ define <vscale x 16 x i16> @umulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16
 ; CHECK-LABEL: umulo_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
   %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
@@ -223,10 +223,10 @@ define <vscale x 32 x i16> @umulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK-LABEL: umulo_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
   %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
@@ -241,10 +241,10 @@ define <vscale x 1 x i32> @umulo_nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x i3
 ; CHECK-LABEL: umulo_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i32>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x i32> %y)
   %b = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } %a, 0
@@ -259,10 +259,10 @@ define <vscale x 2 x i32> @umulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i3
 ; CHECK-LABEL: umulo_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
   %b = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 0
@@ -277,10 +277,10 @@ define <vscale x 4 x i32> @umulo_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i3
 ; CHECK-LABEL: umulo_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
   %b = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 0
@@ -295,10 +295,10 @@ define <vscale x 8 x i32> @umulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i3
 ; CHECK-LABEL: umulo_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
   %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
@@ -313,10 +313,10 @@ define <vscale x 16 x i32> @umulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK-LABEL: umulo_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
   %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
@@ -331,10 +331,10 @@ define <vscale x 1 x i64> @umulo_nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i6
 ; CHECK-LABEL: umulo_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i64>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %y)
   %b = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i1> } %a, 0
@@ -349,10 +349,10 @@ define <vscale x 2 x i64> @umulo_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i6
 ; CHECK-LABEL: umulo_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
   %b = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 0
@@ -367,10 +367,10 @@ define <vscale x 4 x i64> @umulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i6
 ; CHECK-LABEL: umulo_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
   %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
@@ -385,10 +385,10 @@ define <vscale x 8 x i64> @umulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK-LABEL: umulo_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
   %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index 4b5e737d22eb83..cf733226fe8cf2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -565,7 +565,7 @@ declare <vscale x 128 x i8> @llvm.vp.add.nxv128i8(<vscale x 128 x i8>, <vscale x
 define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vadd_vi_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -581,7 +581,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.add.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> splat (i8 -1), <vscale x 128 x i1> %m, i32 %evl)
@@ -1343,10 +1343,10 @@ declare <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32>, <vscale x
 define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vadd_vi_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -1360,7 +1360,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB118_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> splat (i32 -1), <vscale x 32 x i1> %m, i32 %evl)
@@ -1399,10 +1399,10 @@ declare i32 @llvm.vscale.i32()
 define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m) {
 ; CHECK-LABEL: vadd_vi_nxv32i32_evl_nx8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -1416,7 +1416,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB120_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %evl = call i32 @llvm.vscale.i32()
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll
index 939a45e15c1037..6cc4a094b50e44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll
@@ -245,7 +245,6 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -254,6 +253,7 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 6a712080fda74a..ad2caa826a0257 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -13,11 +13,12 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    vmv1r.v v11, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v14, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v12, v8, 1, v0
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 0
 ; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index d98597fabcd953..58b467fc753597 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -90,25 +90,38 @@ declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deint
 define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v28, v8
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    vmv1r.v v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v24, v16, 0
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v28
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v28, v8, 0
+; CHECK-NEXT:    vnsrl.wi v12, v24, 0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v7, v24, 0
+; CHECK-NEXT:    vmsne.vi v7, v8, 0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs1r.v v7, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v24, v16, 8
-; CHECK-NEXT:    vnsrl.wi v28, v8, 8
+; CHECK-NEXT:    vnsrl.wi v0, v16, 8
+; CHECK-NEXT:    vnsrl.wi v4, v24, 8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v9, v24, 0
-; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    vmsne.vi v9, v0, 0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
 ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 9a5e86d61c265e..baad9e16535b01 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -9,16 +9,16 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vwaddu.vv v12, v8, v10
 ; CHECK-NEXT:    li a1, -1
-; CHECK-NEXT:    vwmaccu.vx v16, a1, v12
-; CHECK-NEXT:    vmsne.vi v8, v18, 0
-; CHECK-NEXT:    vmsne.vi v9, v16, 0
+; CHECK-NEXT:    vwmaccu.vx v12, a1, v10
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
+; CHECK-NEXT:    vmsne.vi v9, v12, 0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    add a2, a1, a1
@@ -107,14 +107,14 @@ define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vs
 ; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vid.v v24
-; CHECK-NEXT:    vand.vi v26, v24, 1
-; CHECK-NEXT:    vmsne.vi v28, v26, 0
-; CHECK-NEXT:    vsrl.vi v24, v24, 1
+; CHECK-NEXT:    vsrl.vi v26, v24, 1
+; CHECK-NEXT:    vand.vi v24, v24, 1
+; CHECK-NEXT:    vmsne.vi v28, v24, 0
 ; CHECK-NEXT:    vmv1r.v v0, v28
-; CHECK-NEXT:    vadd.vx v24, v24, a2, v0.t
+; CHECK-NEXT:    vadd.vx v26, v26, a2, v0.t
 ; CHECK-NEXT:    vmv4r.v v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v0, v8, v24
+; CHECK-NEXT:    vrgatherei16.vv v0, v8, v26
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
@@ -123,7 +123,7 @@ define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vs
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv4r.v v16, v12
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24
+; CHECK-NEXT:    vrgatherei16.vv v8, v16, v26
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vs8r.v v8, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 4b6ad0f27214d0..c45448324e550c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -11,16 +11,16 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vwaddu.vv v16, v8, v12
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vwaddu.vv v12, v8, v10
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v16, a0, v12
-; CHECK-NEXT:    vmsne.vi v8, v18, 0
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
+; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
@@ -30,14 +30,14 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ;
 ; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vmv1r.v v9, v0
+; ZVBB-NEXT:    vmv1r.v v16, v0
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, mu
 ; ZVBB-NEXT:    vmv.v.i v10, 0
 ; ZVBB-NEXT:    vmv1r.v v0, v8
-; ZVBB-NEXT:    vmerge.vim v10, v10, 1, v0
-; ZVBB-NEXT:    vwsll.vi v12, v10, 8
+; ZVBB-NEXT:    vmerge.vim v8, v10, 1, v0
+; ZVBB-NEXT:    vwsll.vi v12, v8, 8
 ; ZVBB-NEXT:    li a0, 1
-; ZVBB-NEXT:    vmv1r.v v0, v9
+; ZVBB-NEXT:    vmv1r.v v0, v16
 ; ZVBB-NEXT:    vwaddu.wx v12, v12, a0, v0.t
 ; ZVBB-NEXT:    vmsne.vi v8, v14, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
@@ -122,9 +122,9 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vand.vi v13, v12, 1
-; CHECK-NEXT:    vmsne.vi v0, v13, 0
 ; CHECK-NEXT:    vsrl.vi v16, v12, 1
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -137,9 +137,9 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; ZVBB-NEXT:    vid.v v12
-; ZVBB-NEXT:    vand.vi v13, v12, 1
-; ZVBB-NEXT:    vmsne.vi v0, v13, 0
 ; ZVBB-NEXT:    vsrl.vi v16, v12, 1
+; ZVBB-NEXT:    vand.vi v12, v12, 1
+; ZVBB-NEXT:    vmsne.vi v0, v12, 0
 ; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -158,41 +158,42 @@ declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale
 define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
 ; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v24, v8, v16
+; CHECK-NEXT:    vwaddu.vv v24, v16, v8
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v24, a0, v16
+; CHECK-NEXT:    vwmaccu.vx v24, a0, v8
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v0, v24, 0
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v24, v12, v20
-; CHECK-NEXT:    vwmaccu.vx v24, a0, v20
+; CHECK-NEXT:    vwaddu.vv v24, v20, v12
+; CHECK-NEXT:    vwmaccu.vx v24, a0, v12
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v8, v24, 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv128i1_nxv64i1:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vmv1r.v v24, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; ZVBB-NEXT:    vmv.v.i v24, 0
-; ZVBB-NEXT:    vmerge.vim v16, v24, 1, v0
-; ZVBB-NEXT:    vmv1r.v v0, v8
-; ZVBB-NEXT:    vmerge.vim v8, v24, 1, v0
+; ZVBB-NEXT:    vmv.v.i v16, 0
+; ZVBB-NEXT:    vmerge.vim v8, v16, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v24
+; ZVBB-NEXT:    vmerge.vim v16, v16, 1, v0
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vi v24, v8, 8
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v16
+; ZVBB-NEXT:    vwsll.vi v24, v16, 8
+; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v0, v24, 0
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vi v24, v12, 8
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v20
+; ZVBB-NEXT:    vwsll.vi v24, v20, 8
+; ZVBB-NEXT:    vwaddu.wv v24, v24, v12
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v8, v24, 0
 ; ZVBB-NEXT:    ret
@@ -296,9 +297,9 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vid.v v24
-; CHECK-NEXT:    vand.vi v26, v24, 1
-; CHECK-NEXT:    vmsne.vi v10, v26, 0
 ; CHECK-NEXT:    vsrl.vi v8, v24, 1
+; CHECK-NEXT:    vand.vi v24, v24, 1
+; CHECK-NEXT:    vmsne.vi v10, v24, 0
 ; CHECK-NEXT:    vmv8r.v v24, v0
 ; CHECK-NEXT:    vmv4r.v v12, v4
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -331,9 +332,9 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
 ; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
 ; ZVBB-NEXT:    vid.v v24
-; ZVBB-NEXT:    vand.vi v26, v24, 1
-; ZVBB-NEXT:    vmsne.vi v10, v26, 0
 ; ZVBB-NEXT:    vsrl.vi v8, v24, 1
+; ZVBB-NEXT:    vand.vi v24, v24, 1
+; ZVBB-NEXT:    vmsne.vi v10, v24, 0
 ; ZVBB-NEXT:    vmv8r.v v24, v0
 ; ZVBB-NEXT:    vmv4r.v v12, v4
 ; ZVBB-NEXT:    vmv1r.v v0, v10
@@ -492,9 +493,9 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vand.vi v13, v12, 1
-; CHECK-NEXT:    vmsne.vi v0, v13, 0
 ; CHECK-NEXT:    vsrl.vi v16, v12, 1
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -507,9 +508,9 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; ZVBB-NEXT:    vid.v v12
-; ZVBB-NEXT:    vand.vi v13, v12, 1
-; ZVBB-NEXT:    vmsne.vi v0, v13, 0
 ; ZVBB-NEXT:    vsrl.vi v16, v12, 1
+; ZVBB-NEXT:    vand.vi v12, v12, 1
+; ZVBB-NEXT:    vmsne.vi v0, v12, 0
 ; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -596,9 +597,9 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vid.v v24
-; CHECK-NEXT:    vand.vi v26, v24, 1
-; CHECK-NEXT:    vmsne.vi v10, v26, 0
 ; CHECK-NEXT:    vsrl.vi v8, v24, 1
+; CHECK-NEXT:    vand.vi v24, v24, 1
+; CHECK-NEXT:    vmsne.vi v10, v24, 0
 ; CHECK-NEXT:    vmv8r.v v24, v0
 ; CHECK-NEXT:    vmv4r.v v12, v4
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -631,9 +632,9 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
 ; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
 ; ZVBB-NEXT:    vid.v v24
-; ZVBB-NEXT:    vand.vi v26, v24, 1
-; ZVBB-NEXT:    vmsne.vi v10, v26, 0
 ; ZVBB-NEXT:    vsrl.vi v8, v24, 1
+; ZVBB-NEXT:    vand.vi v24, v24, 1
+; ZVBB-NEXT:    vmsne.vi v10, v24, 0
 ; ZVBB-NEXT:    vmv8r.v v24, v0
 ; ZVBB-NEXT:    vmv4r.v v12, v4
 ; ZVBB-NEXT:    vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index c98242437f6291..cb056276b24cf4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -9,19 +9,19 @@ declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x
 define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i1_offset_negone:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
@@ -31,21 +31,21 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vsc
 define <vscale x 1 x i1> @splice_nxv1i1_offset_max(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i1_offset_max:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
@@ -57,19 +57,19 @@ declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x
 define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_offset_negone:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
@@ -79,21 +79,21 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vsc
 define <vscale x 2 x i1> @splice_nxv2i1_offset_max(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_offset_max:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 3
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
@@ -105,19 +105,19 @@ declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x
 define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1_offset_negone:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
@@ -127,21 +127,21 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vsc
 define <vscale x 4 x i1> @splice_nxv4i1_offset_max(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1_offset_max:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -7
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 7
+; CHECK-NEXT:    vslidedown.vi v8, v8, 7
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
@@ -153,18 +153,18 @@ declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x
 define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1_offset_negone:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
@@ -174,20 +174,20 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vsc
 define <vscale x 8 x i1> @splice_nxv8i1_offset_max(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1_offset_max:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    addi a0, a0, -15
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 15
+; CHECK-NEXT:    vslidedown.vi v8, v8, 15
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
@@ -201,18 +201,18 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 1
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
@@ -225,18 +225,18 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_max(<vscale x 16 x i1> %a, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -31
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v12, a0
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
@@ -251,18 +251,18 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v12, v16, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 1
+; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
@@ -274,19 +274,19 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_max(<vscale x 32 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv32i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmerge.vim v12, v16, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -63
 ; CHECK-NEXT:    li a1, 63
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a1
+; CHECK-NEXT:    vslidedown.vx v12, v12, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT:    vslideup.vx v16, v8, a0
-; CHECK-NEXT:    vand.vi v8, v16, 1
+; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vslideup.vx v12, v8, a0
+; CHECK-NEXT:    vand.vi v8, v12, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
@@ -298,21 +298,21 @@ declare <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64
 define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv64i1_offset_negone:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslidedown.vx v16, v16, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 1
-; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    vslideup.vi v16, v8, 1
+; CHECK-NEXT:    vand.vi v8, v16, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
@@ -323,19 +323,19 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_max(<vscale x 64 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv64i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
+; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -127
 ; CHECK-NEXT:    li a1, 127
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v24, a1
+; CHECK-NEXT:    vslidedown.vx v16, v16, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
-; CHECK-NEXT:    vslideup.vx v24, v8, a0
-; CHECK-NEXT:    vand.vi v8, v24, 1
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT:    vslideup.vx v16, v8, a0
+; CHECK-NEXT:    vand.vi v8, v16, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index df2bc523cd7a83..35488d1e0d0b8a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -229,7 +229,6 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfabs_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -238,21 +237,22 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfabs.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    vfabs.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -526,10 +526,10 @@ declare <vscale x 16 x double> @llvm.vp.fabs.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfabs_vv_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
@@ -542,7 +542,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfabs.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.fabs.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index c69a7bc5cece47..10d34457c3da48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -569,7 +569,6 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -578,6 +577,7 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -675,12 +675,11 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
+; ZVFHMIN-NEXT:    vfmv.v.f v16, fa5
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -689,22 +688,23 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v24, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB24_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB24_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
index 1cc9ea029d4574..15849fd2e277e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
@@ -1944,10 +1944,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -1960,10 +1960,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -1978,10 +1978,10 @@ define <vscale x 8 x i1> @fcmp_ogt_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -1995,10 +1995,10 @@ define <vscale x 8 x i1> @fcmp_oge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -2011,10 +2011,10 @@ define <vscale x 8 x i1> @fcmp_oge_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2029,10 +2029,10 @@ define <vscale x 8 x i1> @fcmp_oge_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2046,10 +2046,10 @@ define <vscale x 8 x i1> @fcmp_olt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -2062,10 +2062,10 @@ define <vscale x 8 x i1> @fcmp_olt_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2080,10 +2080,10 @@ define <vscale x 8 x i1> @fcmp_olt_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2097,10 +2097,10 @@ define <vscale x 8 x i1> @fcmp_ole_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -2113,10 +2113,10 @@ define <vscale x 8 x i1> @fcmp_ole_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2131,10 +2131,10 @@ define <vscale x 8 x i1> @fcmp_ole_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2148,12 +2148,12 @@ define <vscale x 8 x i1> @fcmp_one_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -2166,12 +2166,12 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2186,12 +2186,12 @@ define <vscale x 8 x i1> @fcmp_one_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2247,12 +2247,12 @@ define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -2265,12 +2265,12 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2285,12 +2285,12 @@ define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2304,8 +2304,8 @@ define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -2320,8 +2320,8 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2338,8 +2338,8 @@ define <vscale x 8 x i1> @fcmp_ugt_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2355,8 +2355,8 @@ define <vscale x 8 x i1> @fcmp_uge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -2371,8 +2371,8 @@ define <vscale x 8 x i1> @fcmp_uge_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2389,8 +2389,8 @@ define <vscale x 8 x i1> @fcmp_uge_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2406,8 +2406,8 @@ define <vscale x 8 x i1> @fcmp_ult_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -2422,8 +2422,8 @@ define <vscale x 8 x i1> @fcmp_ult_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2440,8 +2440,8 @@ define <vscale x 8 x i1> @fcmp_ult_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2457,8 +2457,8 @@ define <vscale x 8 x i1> @fcmp_ule_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -2473,8 +2473,8 @@ define <vscale x 8 x i1> @fcmp_ule_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2491,8 +2491,8 @@ define <vscale x 8 x i1> @fcmp_ule_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -2619,10 +2619,10 @@ define <vscale x 16 x i1> @fcmp_ogt_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -2635,10 +2635,10 @@ define <vscale x 16 x i1> @fcmp_ogt_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2653,10 +2653,10 @@ define <vscale x 16 x i1> @fcmp_ogt_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2670,10 +2670,10 @@ define <vscale x 16 x i1> @fcmp_oge_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -2686,10 +2686,10 @@ define <vscale x 16 x i1> @fcmp_oge_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2704,10 +2704,10 @@ define <vscale x 16 x i1> @fcmp_oge_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2721,10 +2721,10 @@ define <vscale x 16 x i1> @fcmp_olt_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -2737,10 +2737,10 @@ define <vscale x 16 x i1> @fcmp_olt_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2755,10 +2755,10 @@ define <vscale x 16 x i1> @fcmp_olt_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2772,10 +2772,10 @@ define <vscale x 16 x i1> @fcmp_ole_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -2788,10 +2788,10 @@ define <vscale x 16 x i1> @fcmp_ole_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2806,10 +2806,10 @@ define <vscale x 16 x i1> @fcmp_ole_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2823,12 +2823,12 @@ define <vscale x 16 x i1> @fcmp_one_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -2841,12 +2841,12 @@ define <vscale x 16 x i1> @fcmp_one_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2861,12 +2861,12 @@ define <vscale x 16 x i1> @fcmp_one_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2922,12 +2922,12 @@ define <vscale x 16 x i1> @fcmp_ueq_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -2940,12 +2940,12 @@ define <vscale x 16 x i1> @fcmp_ueq_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2960,12 +2960,12 @@ define <vscale x 16 x i1> @fcmp_ueq_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2979,8 +2979,8 @@ define <vscale x 16 x i1> @fcmp_ugt_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -2995,8 +2995,8 @@ define <vscale x 16 x i1> @fcmp_ugt_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3013,8 +3013,8 @@ define <vscale x 16 x i1> @fcmp_ugt_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3030,8 +3030,8 @@ define <vscale x 16 x i1> @fcmp_uge_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3046,8 +3046,8 @@ define <vscale x 16 x i1> @fcmp_uge_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3064,8 +3064,8 @@ define <vscale x 16 x i1> @fcmp_uge_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3081,8 +3081,8 @@ define <vscale x 16 x i1> @fcmp_ult_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3097,8 +3097,8 @@ define <vscale x 16 x i1> @fcmp_ult_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3115,8 +3115,8 @@ define <vscale x 16 x i1> @fcmp_ult_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3132,8 +3132,8 @@ define <vscale x 16 x i1> @fcmp_ule_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3148,8 +3148,8 @@ define <vscale x 16 x i1> @fcmp_ule_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3166,8 +3166,8 @@ define <vscale x 16 x i1> @fcmp_ule_fv_nxv16f16(<vscale x 16 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -3294,10 +3294,10 @@ define <vscale x 32 x i1> @fcmp_ogt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 32 x i1> @llvm.experimental.constrained.fcmp.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 32 x i1> %1
@@ -3310,10 +3310,10 @@ define <vscale x 32 x i1> @fcmp_ogt_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3328,10 +3328,10 @@ define <vscale x 32 x i1> @fcmp_ogt_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3345,10 +3345,10 @@ define <vscale x 32 x i1> @fcmp_oge_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfle.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmfle.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 32 x i1> @llvm.experimental.constrained.fcmp.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 32 x i1> %1
@@ -3361,10 +3361,10 @@ define <vscale x 32 x i1> @fcmp_oge_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfge.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3379,10 +3379,10 @@ define <vscale x 32 x i1> @fcmp_oge_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3396,10 +3396,10 @@ define <vscale x 32 x i1> @fcmp_olt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 32 x i1> @llvm.experimental.constrained.fcmp.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 32 x i1> %1
@@ -3412,10 +3412,10 @@ define <vscale x 32 x i1> @fcmp_olt_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3430,10 +3430,10 @@ define <vscale x 32 x i1> @fcmp_olt_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3447,10 +3447,10 @@ define <vscale x 32 x i1> @fcmp_ole_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfle.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmfle.vv v25, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 32 x i1> @llvm.experimental.constrained.fcmp.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 32 x i1> %1
@@ -3463,10 +3463,10 @@ define <vscale x 32 x i1> @fcmp_ole_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3481,10 +3481,10 @@ define <vscale x 32 x i1> @fcmp_ole_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfge.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3498,12 +3498,12 @@ define <vscale x 32 x i1> @fcmp_one_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v24, v25
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v25, v24
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 32 x i1> @llvm.experimental.constrained.fcmp.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 32 x i1> %1
@@ -3516,12 +3516,12 @@ define <vscale x 32 x i1> @fcmp_one_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3536,12 +3536,12 @@ define <vscale x 32 x i1> @fcmp_one_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3597,12 +3597,12 @@ define <vscale x 32 x i1> @fcmp_ueq_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v24, v25
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v25, v24
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 32 x i1> @llvm.experimental.constrained.fcmp.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 32 x i1> %1
@@ -3615,12 +3615,12 @@ define <vscale x 32 x i1> @fcmp_ueq_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3635,12 +3635,12 @@ define <vscale x 32 x i1> @fcmp_ueq_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3654,8 +3654,8 @@ define <vscale x 32 x i1> @fcmp_ugt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmfle.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -3670,8 +3670,8 @@ define <vscale x 32 x i1> @fcmp_ugt_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3688,8 +3688,8 @@ define <vscale x 32 x i1> @fcmp_ugt_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3705,8 +3705,8 @@ define <vscale x 32 x i1> @fcmp_uge_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -3721,8 +3721,8 @@ define <vscale x 32 x i1> @fcmp_uge_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3739,8 +3739,8 @@ define <vscale x 32 x i1> @fcmp_uge_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3756,8 +3756,8 @@ define <vscale x 32 x i1> @fcmp_ult_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmfle.vv v24, v16, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -3772,8 +3772,8 @@ define <vscale x 32 x i1> @fcmp_ult_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3790,8 +3790,8 @@ define <vscale x 32 x i1> @fcmp_ult_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3807,8 +3807,8 @@ define <vscale x 32 x i1> @fcmp_ule_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -3823,8 +3823,8 @@ define <vscale x 32 x i1> @fcmp_ule_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -3841,8 +3841,8 @@ define <vscale x 32 x i1> @fcmp_ule_fv_nxv32f16(<vscale x 32 x half> %va, half %
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -5235,10 +5235,10 @@ define <vscale x 4 x i1> @fcmp_ogt_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -5251,10 +5251,10 @@ define <vscale x 4 x i1> @fcmp_ogt_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5269,10 +5269,10 @@ define <vscale x 4 x i1> @fcmp_ogt_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5286,10 +5286,10 @@ define <vscale x 4 x i1> @fcmp_oge_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -5302,10 +5302,10 @@ define <vscale x 4 x i1> @fcmp_oge_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5320,10 +5320,10 @@ define <vscale x 4 x i1> @fcmp_oge_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5337,10 +5337,10 @@ define <vscale x 4 x i1> @fcmp_olt_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -5353,10 +5353,10 @@ define <vscale x 4 x i1> @fcmp_olt_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5371,10 +5371,10 @@ define <vscale x 4 x i1> @fcmp_olt_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5388,10 +5388,10 @@ define <vscale x 4 x i1> @fcmp_ole_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -5404,10 +5404,10 @@ define <vscale x 4 x i1> @fcmp_ole_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5422,10 +5422,10 @@ define <vscale x 4 x i1> @fcmp_ole_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5439,12 +5439,12 @@ define <vscale x 4 x i1> @fcmp_one_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -5457,12 +5457,12 @@ define <vscale x 4 x i1> @fcmp_one_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5477,12 +5477,12 @@ define <vscale x 4 x i1> @fcmp_one_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5538,12 +5538,12 @@ define <vscale x 4 x i1> @fcmp_ueq_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -5556,12 +5556,12 @@ define <vscale x 4 x i1> @fcmp_ueq_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5576,12 +5576,12 @@ define <vscale x 4 x i1> @fcmp_ueq_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5595,8 +5595,8 @@ define <vscale x 4 x i1> @fcmp_ugt_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -5611,8 +5611,8 @@ define <vscale x 4 x i1> @fcmp_ugt_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5629,8 +5629,8 @@ define <vscale x 4 x i1> @fcmp_ugt_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5646,8 +5646,8 @@ define <vscale x 4 x i1> @fcmp_uge_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -5662,8 +5662,8 @@ define <vscale x 4 x i1> @fcmp_uge_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5680,8 +5680,8 @@ define <vscale x 4 x i1> @fcmp_uge_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5697,8 +5697,8 @@ define <vscale x 4 x i1> @fcmp_ult_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -5713,8 +5713,8 @@ define <vscale x 4 x i1> @fcmp_ult_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5731,8 +5731,8 @@ define <vscale x 4 x i1> @fcmp_ult_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5748,8 +5748,8 @@ define <vscale x 4 x i1> @fcmp_ule_vv_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -5764,8 +5764,8 @@ define <vscale x 4 x i1> @fcmp_ule_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5782,8 +5782,8 @@ define <vscale x 4 x i1> @fcmp_ule_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -5910,10 +5910,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -5926,10 +5926,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -5944,10 +5944,10 @@ define <vscale x 8 x i1> @fcmp_ogt_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -5961,10 +5961,10 @@ define <vscale x 8 x i1> @fcmp_oge_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -5977,10 +5977,10 @@ define <vscale x 8 x i1> @fcmp_oge_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -5995,10 +5995,10 @@ define <vscale x 8 x i1> @fcmp_oge_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6012,10 +6012,10 @@ define <vscale x 8 x i1> @fcmp_olt_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -6028,10 +6028,10 @@ define <vscale x 8 x i1> @fcmp_olt_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6046,10 +6046,10 @@ define <vscale x 8 x i1> @fcmp_olt_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6063,10 +6063,10 @@ define <vscale x 8 x i1> @fcmp_ole_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -6079,10 +6079,10 @@ define <vscale x 8 x i1> @fcmp_ole_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6097,10 +6097,10 @@ define <vscale x 8 x i1> @fcmp_ole_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6114,12 +6114,12 @@ define <vscale x 8 x i1> @fcmp_one_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -6132,12 +6132,12 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6152,12 +6152,12 @@ define <vscale x 8 x i1> @fcmp_one_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6213,12 +6213,12 @@ define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -6231,12 +6231,12 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6251,12 +6251,12 @@ define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -6270,8 +6270,8 @@ define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6286,8 +6286,8 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6304,8 +6304,8 @@ define <vscale x 8 x i1> @fcmp_ugt_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6321,8 +6321,8 @@ define <vscale x 8 x i1> @fcmp_uge_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6337,8 +6337,8 @@ define <vscale x 8 x i1> @fcmp_uge_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6355,8 +6355,8 @@ define <vscale x 8 x i1> @fcmp_uge_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6372,8 +6372,8 @@ define <vscale x 8 x i1> @fcmp_ult_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6388,8 +6388,8 @@ define <vscale x 8 x i1> @fcmp_ult_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6406,8 +6406,8 @@ define <vscale x 8 x i1> @fcmp_ult_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6423,8 +6423,8 @@ define <vscale x 8 x i1> @fcmp_ule_vv_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6439,8 +6439,8 @@ define <vscale x 8 x i1> @fcmp_ule_vf_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6457,8 +6457,8 @@ define <vscale x 8 x i1> @fcmp_ule_fv_nxv8f32(<vscale x 8 x float> %va, float %b
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -6585,10 +6585,10 @@ define <vscale x 16 x i1> @fcmp_ogt_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x float> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -6601,10 +6601,10 @@ define <vscale x 16 x i1> @fcmp_ogt_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6619,10 +6619,10 @@ define <vscale x 16 x i1> @fcmp_ogt_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6636,10 +6636,10 @@ define <vscale x 16 x i1> @fcmp_oge_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfle.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmfle.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x float> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -6652,10 +6652,10 @@ define <vscale x 16 x i1> @fcmp_oge_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfge.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6670,10 +6670,10 @@ define <vscale x 16 x i1> @fcmp_oge_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6687,10 +6687,10 @@ define <vscale x 16 x i1> @fcmp_olt_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x float> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -6703,10 +6703,10 @@ define <vscale x 16 x i1> @fcmp_olt_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6721,10 +6721,10 @@ define <vscale x 16 x i1> @fcmp_olt_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6738,10 +6738,10 @@ define <vscale x 16 x i1> @fcmp_ole_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfle.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmfle.vv v25, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x float> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -6754,10 +6754,10 @@ define <vscale x 16 x i1> @fcmp_ole_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6772,10 +6772,10 @@ define <vscale x 16 x i1> @fcmp_ole_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfge.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6789,12 +6789,12 @@ define <vscale x 16 x i1> @fcmp_one_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v24, v25
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v25, v24
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x float> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -6807,12 +6807,12 @@ define <vscale x 16 x i1> @fcmp_one_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6827,12 +6827,12 @@ define <vscale x 16 x i1> @fcmp_one_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6888,12 +6888,12 @@ define <vscale x 16 x i1> @fcmp_ueq_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v24, v25
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v25, v24
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i1> @llvm.experimental.constrained.fcmp.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 16 x i1> %1
@@ -6906,12 +6906,12 @@ define <vscale x 16 x i1> @fcmp_ueq_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6926,12 +6926,12 @@ define <vscale x 16 x i1> @fcmp_ueq_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6945,8 +6945,8 @@ define <vscale x 16 x i1> @fcmp_ugt_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmfle.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -6961,8 +6961,8 @@ define <vscale x 16 x i1> @fcmp_ugt_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6979,8 +6979,8 @@ define <vscale x 16 x i1> @fcmp_ugt_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -6996,8 +6996,8 @@ define <vscale x 16 x i1> @fcmp_uge_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -7012,8 +7012,8 @@ define <vscale x 16 x i1> @fcmp_uge_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7030,8 +7030,8 @@ define <vscale x 16 x i1> @fcmp_uge_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7047,8 +7047,8 @@ define <vscale x 16 x i1> @fcmp_ult_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmfle.vv v24, v16, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -7063,8 +7063,8 @@ define <vscale x 16 x i1> @fcmp_ult_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7081,8 +7081,8 @@ define <vscale x 16 x i1> @fcmp_ult_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7098,8 +7098,8 @@ define <vscale x 16 x i1> @fcmp_ule_vv_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -7114,8 +7114,8 @@ define <vscale x 16 x i1> @fcmp_ule_vf_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7132,8 +7132,8 @@ define <vscale x 16 x i1> @fcmp_ule_fv_nxv16f32(<vscale x 16 x float> %va, float
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -7893,10 +7893,10 @@ define <vscale x 2 x i1> @fcmp_ogt_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 2 x i1> %1
@@ -7909,10 +7909,10 @@ define <vscale x 2 x i1> @fcmp_ogt_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -7927,10 +7927,10 @@ define <vscale x 2 x i1> @fcmp_ogt_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -7944,10 +7944,10 @@ define <vscale x 2 x i1> @fcmp_oge_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 2 x i1> %1
@@ -7960,10 +7960,10 @@ define <vscale x 2 x i1> @fcmp_oge_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -7978,10 +7978,10 @@ define <vscale x 2 x i1> @fcmp_oge_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -7995,10 +7995,10 @@ define <vscale x 2 x i1> @fcmp_olt_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 2 x i1> %1
@@ -8011,10 +8011,10 @@ define <vscale x 2 x i1> @fcmp_olt_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8029,10 +8029,10 @@ define <vscale x 2 x i1> @fcmp_olt_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8046,10 +8046,10 @@ define <vscale x 2 x i1> @fcmp_ole_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 2 x i1> %1
@@ -8062,10 +8062,10 @@ define <vscale x 2 x i1> @fcmp_ole_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8080,10 +8080,10 @@ define <vscale x 2 x i1> @fcmp_ole_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8097,12 +8097,12 @@ define <vscale x 2 x i1> @fcmp_one_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 2 x i1> %1
@@ -8115,12 +8115,12 @@ define <vscale x 2 x i1> @fcmp_one_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8135,12 +8135,12 @@ define <vscale x 2 x i1> @fcmp_one_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8196,12 +8196,12 @@ define <vscale x 2 x i1> @fcmp_ueq_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vv v13, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vv v13, v10, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i1> @llvm.experimental.constrained.fcmp.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 2 x i1> %1
@@ -8214,12 +8214,12 @@ define <vscale x 2 x i1> @fcmp_ueq_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8234,12 +8234,12 @@ define <vscale x 2 x i1> @fcmp_ueq_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v11, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v11, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -8253,8 +8253,8 @@ define <vscale x 2 x i1> @fcmp_ugt_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8269,8 +8269,8 @@ define <vscale x 2 x i1> @fcmp_ugt_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8287,8 +8287,8 @@ define <vscale x 2 x i1> @fcmp_ugt_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8304,8 +8304,8 @@ define <vscale x 2 x i1> @fcmp_uge_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v13, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v8, v10, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8320,8 +8320,8 @@ define <vscale x 2 x i1> @fcmp_uge_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8338,8 +8338,8 @@ define <vscale x 2 x i1> @fcmp_uge_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8355,8 +8355,8 @@ define <vscale x 2 x i1> @fcmp_ult_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8371,8 +8371,8 @@ define <vscale x 2 x i1> @fcmp_ult_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfge.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8389,8 +8389,8 @@ define <vscale x 2 x i1> @fcmp_ult_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfle.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8406,8 +8406,8 @@ define <vscale x 2 x i1> @fcmp_ule_vv_nxv2f64(<vscale x 2 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v13, v10, v10
-; CHECK-NEXT:    vmand.mm v12, v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v13, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vv v12, v10, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8422,8 +8422,8 @@ define <vscale x 2 x i1> @fcmp_ule_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmfgt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8440,8 +8440,8 @@ define <vscale x 2 x i1> @fcmp_ule_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
 ; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmand.mm v0, v10, v12
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vmflt.vf v10, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v10
 ; CHECK-NEXT:    ret
@@ -8568,10 +8568,10 @@ define <vscale x 4 x i1> @fcmp_ogt_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -8584,10 +8584,10 @@ define <vscale x 4 x i1> @fcmp_ogt_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8602,10 +8602,10 @@ define <vscale x 4 x i1> @fcmp_ogt_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8619,10 +8619,10 @@ define <vscale x 4 x i1> @fcmp_oge_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -8635,10 +8635,10 @@ define <vscale x 4 x i1> @fcmp_oge_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8653,10 +8653,10 @@ define <vscale x 4 x i1> @fcmp_oge_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8670,10 +8670,10 @@ define <vscale x 4 x i1> @fcmp_olt_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -8686,10 +8686,10 @@ define <vscale x 4 x i1> @fcmp_olt_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8704,10 +8704,10 @@ define <vscale x 4 x i1> @fcmp_olt_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8721,10 +8721,10 @@ define <vscale x 4 x i1> @fcmp_ole_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vv v17, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -8737,10 +8737,10 @@ define <vscale x 4 x i1> @fcmp_ole_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8755,10 +8755,10 @@ define <vscale x 4 x i1> @fcmp_ole_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8772,12 +8772,12 @@ define <vscale x 4 x i1> @fcmp_one_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -8790,12 +8790,12 @@ define <vscale x 4 x i1> @fcmp_one_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8810,12 +8810,12 @@ define <vscale x 4 x i1> @fcmp_one_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8871,12 +8871,12 @@ define <vscale x 4 x i1> @fcmp_ueq_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vv v17, v8, v12, v0.t
-; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vv v17, v12, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i1> @llvm.experimental.constrained.fcmp.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 4 x i1> %1
@@ -8889,12 +8889,12 @@ define <vscale x 4 x i1> @fcmp_ueq_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8909,12 +8909,12 @@ define <vscale x 4 x i1> @fcmp_ueq_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v13, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmfgt.vf v13, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v12, v13
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v13, v0
+; CHECK-NEXT:    vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v13, v12
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -8928,8 +8928,8 @@ define <vscale x 4 x i1> @fcmp_ugt_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -8944,8 +8944,8 @@ define <vscale x 4 x i1> @fcmp_ugt_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8962,8 +8962,8 @@ define <vscale x 4 x i1> @fcmp_ugt_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -8979,8 +8979,8 @@ define <vscale x 4 x i1> @fcmp_uge_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v17, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -8995,8 +8995,8 @@ define <vscale x 4 x i1> @fcmp_uge_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9013,8 +9013,8 @@ define <vscale x 4 x i1> @fcmp_uge_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9030,8 +9030,8 @@ define <vscale x 4 x i1> @fcmp_ult_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9046,8 +9046,8 @@ define <vscale x 4 x i1> @fcmp_ult_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfge.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9064,8 +9064,8 @@ define <vscale x 4 x i1> @fcmp_ult_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfle.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9081,8 +9081,8 @@ define <vscale x 4 x i1> @fcmp_ule_vv_nxv4f64(<vscale x 4 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v17, v12, v12
-; CHECK-NEXT:    vmand.mm v16, v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v17, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9097,8 +9097,8 @@ define <vscale x 4 x i1> @fcmp_ule_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v16, v12
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmfgt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9115,8 +9115,8 @@ define <vscale x 4 x i1> @fcmp_ule_fv_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vmfeq.vf v16, v12, fa0
 ; CHECK-NEXT:    vmfeq.vv v12, v8, v8
-; CHECK-NEXT:    vmand.mm v12, v12, v16
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmand.mm v0, v12, v16
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vmflt.vf v12, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v12
 ; CHECK-NEXT:    ret
@@ -9243,10 +9243,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x double> %vb, metadata !"ogt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -9259,10 +9259,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9277,10 +9277,10 @@ define <vscale x 8 x i1> @fcmp_ogt_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9294,10 +9294,10 @@ define <vscale x 8 x i1> @fcmp_oge_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfle.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmfle.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x double> %vb, metadata !"oge", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -9310,10 +9310,10 @@ define <vscale x 8 x i1> @fcmp_oge_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfge.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9328,10 +9328,10 @@ define <vscale x 8 x i1> @fcmp_oge_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9345,10 +9345,10 @@ define <vscale x 8 x i1> @fcmp_olt_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x double> %vb, metadata !"olt", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -9361,10 +9361,10 @@ define <vscale x 8 x i1> @fcmp_olt_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9379,10 +9379,10 @@ define <vscale x 8 x i1> @fcmp_olt_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9396,10 +9396,10 @@ define <vscale x 8 x i1> @fcmp_ole_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfle.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmfle.vv v25, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x double> %vb, metadata !"ole", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -9412,10 +9412,10 @@ define <vscale x 8 x i1> @fcmp_ole_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfle.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9430,10 +9430,10 @@ define <vscale x 8 x i1> @fcmp_ole_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfge.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9447,12 +9447,12 @@ define <vscale x 8 x i1> @fcmp_one_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v24, v25
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmor.mm v0, v25, v24
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x double> %vb, metadata !"one", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -9465,12 +9465,12 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9485,12 +9485,12 @@ define <vscale x 8 x i1> @fcmp_one_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9546,12 +9546,12 @@ define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmflt.vv v25, v8, v16, v0.t
-; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v24, v25
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vmflt.vv v25, v16, v8, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v25, v24
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i1> @llvm.experimental.constrained.fcmp.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   ret <vscale x 8 x i1> %1
@@ -9564,12 +9564,12 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9584,12 +9584,12 @@ define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v17, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmfgt.vf v17, v8, fa0, v0.t
-; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v16, v17
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
+; CHECK-NEXT:    vmv1r.v v17, v0
+; CHECK-NEXT:    vmflt.vf v17, v8, fa0, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v17, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
@@ -9603,8 +9603,8 @@ define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmfle.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -9619,8 +9619,8 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9637,8 +9637,8 @@ define <vscale x 8 x i1> @fcmp_ugt_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9654,8 +9654,8 @@ define <vscale x 8 x i1> @fcmp_uge_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmflt.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -9670,8 +9670,8 @@ define <vscale x 8 x i1> @fcmp_uge_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9688,8 +9688,8 @@ define <vscale x 8 x i1> @fcmp_uge_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9705,8 +9705,8 @@ define <vscale x 8 x i1> @fcmp_ult_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmfle.vv v24, v16, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -9721,8 +9721,8 @@ define <vscale x 8 x i1> @fcmp_ult_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfge.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9739,8 +9739,8 @@ define <vscale x 8 x i1> @fcmp_ult_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfle.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9756,8 +9756,8 @@ define <vscale x 8 x i1> @fcmp_ule_vv_nxv8f64(<vscale x 8 x double> %va, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfeq.vv v24, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16
-; CHECK-NEXT:    vmand.mm v24, v25, v24
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmand.mm v0, v25, v24
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vmflt.vv v24, v16, v8, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v24
 ; CHECK-NEXT:    ret
@@ -9772,8 +9772,8 @@ define <vscale x 8 x i1> @fcmp_ule_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v24, v16
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmfgt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
@@ -9790,8 +9790,8 @@ define <vscale x 8 x i1> @fcmp_ule_fv_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
 ; CHECK-NEXT:    vmfeq.vf v24, v16, fa0
 ; CHECK-NEXT:    vmfeq.vv v16, v8, v8
-; CHECK-NEXT:    vmand.mm v16, v16, v24
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmand.mm v0, v16, v24
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vmflt.vf v16, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index 3ad17e85570a20..d5cc567e1b7183 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -515,7 +515,6 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -524,6 +523,7 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -621,12 +621,11 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
+; ZVFHMIN-NEXT:    vfmv.v.f v16, fa5
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -635,22 +634,23 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v24, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index a41c2621161366..6d24bc262029f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -1108,19 +1108,19 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    li a3, 50
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x32, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 50 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    li a3, 18
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a3, 34
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1130,7 +1130,8 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    add a5, a2, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    li a6, 26
+; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
@@ -1141,31 +1142,61 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    srli a6, a1, 3
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    vl8re64.v v16, (a2)
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    li a7, 42
+; CHECK-NEXT:    mul a3, a3, a7
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a3, a2, 3
+; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v0, a6
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a2, a0, 4
+; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs1r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vslidedown.vx v16, v0, a6
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs1r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    li a2, 26
 ; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a2, 18
+; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a2, 42
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a2, 42
 ; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
@@ -1175,29 +1206,35 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB92_2:
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 34
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    li a1, 42
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    li a1, 50
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 7556b3ace5c6cb..1fe8daa2c26012 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -245,7 +245,6 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -254,6 +253,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index 755c665376128a..aab326b1419b45 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -245,7 +245,6 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -254,6 +253,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index 30d5919238cfa5..9a970dddf8e374 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -515,7 +515,6 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -524,6 +523,7 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -621,12 +621,11 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
+; ZVFHMIN-NEXT:    vfmv.v.f v16, fa5
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -635,22 +634,23 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v24, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 292f27794f378a..f4ab947406f362 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1108,19 +1108,19 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    li a3, 50
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x32, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 50 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    li a3, 18
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a3, 34
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1130,7 +1130,8 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    add a5, a2, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    li a6, 26
+; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
@@ -1141,31 +1142,61 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    srli a6, a1, 3
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    vl8re64.v v16, (a2)
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    li a7, 42
+; CHECK-NEXT:    mul a3, a3, a7
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a3, a2, 3
+; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v0, a6
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a2, a0, 4
+; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs1r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vslidedown.vx v16, v0, a6
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs1r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    li a2, 26
 ; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a2, 18
+; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a2, 42
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a2, 42
 ; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
@@ -1175,29 +1206,35 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB92_2:
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 34
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    li a1, 42
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    li a1, 50
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
index 1db5fa1720a276..63c7313ff7faa7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
@@ -229,7 +229,6 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfneg_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -238,21 +237,22 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfneg.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfneg.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfneg.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    vfneg.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -526,10 +526,10 @@ declare <vscale x 16 x double> @llvm.vp.fneg.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfneg_vv_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
@@ -542,7 +542,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfneg.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.fneg.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 5cfa98916a2de0..5127933811d9a3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -96,10 +96,10 @@ declare <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x ha
 define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a, <vscale x 32 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vfpext_nxv32f16_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 15c4bf255e6dce..1fc23099d80a4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -394,10 +394,10 @@ declare <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x flo
 define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -411,7 +411,7 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfncvt.rtz.x.f.w v24, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
@@ -424,10 +424,10 @@ declare <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x flo
 define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptosi_nxv32i32_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -441,7 +441,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index a2591e7dc35f03..af7b4d45feb48b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -394,10 +394,10 @@ declare <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x flo
 define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptoui_nxv32i16_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -411,7 +411,7 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfncvt.rtz.xu.f.w v24, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
@@ -424,10 +424,10 @@ declare <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x flo
 define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptoui_nxv32i32_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -441,7 +441,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index dd122f1f251103..b85197d554ba0c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -96,39 +96,25 @@ declare <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f64.nxv16f32(<vscale x 16 x
 define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double> %a, <vscale x 16 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vfptrunc_nxv16f32_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfncvt.f.f.w v20, v24, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v28, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vfncvt.f.f.w v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vfncvt.f.f.w v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f64.nxv16f32(<vscale x 16 x double> %a, <vscale x 16 x i1> %m, i32 %vl)
   ret <vscale x 16 x float> %v
@@ -145,7 +131,6 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -154,49 +139,51 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    srli a4, a1, 2
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v0, a4
-; CHECK-NEXT:    slli a4, a1, 3
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    vl8re64.v v8, (a4)
-; CHECK-NEXT:    slli a4, a1, 1
-; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    sltu a6, a2, a5
+; CHECK-NEXT:    srli a5, a1, 3
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a5
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
+; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    add a3, a0, a3
+; CHECK-NEXT:    vl8re64.v v16, (a3)
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    sub a4, a2, a3
+; CHECK-NEXT:    sltu a6, a2, a4
 ; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    sub a6, a5, a1
-; CHECK-NEXT:    sltu a7, a5, a6
+; CHECK-NEXT:    and a4, a6, a4
+; CHECK-NEXT:    sub a6, a4, a1
+; CHECK-NEXT:    sltu a7, a4, a6
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    and a6, a7, a6
-; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vslidedown.vx v0, v16, a3
-; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; CHECK-NEXT:    bltu a5, a1, .LBB8_2
+; CHECK-NEXT:    and a0, a7, a6
+; CHECK-NEXT:    vmv1r.v v24, v25
+; CHECK-NEXT:    vslidedown.vx v0, v25, a5
+; CHECK-NEXT:    bltu a4, a1, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a5, a1
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB8_2:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v7, a3
-; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vfncvt.f.f.w v16, v24, v0.t
-; CHECK-NEXT:    bltu a2, a4, .LBB8_4
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vfncvt.f.f.w v28, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vfncvt.f.f.w v24, v8, v0.t
+; CHECK-NEXT:    bltu a2, a3, .LBB8_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a2, a4
+; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB8_4:
 ; CHECK-NEXT:    sub a0, a2, a1
 ; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfncvt.f.f.w v28, v8, v0.t
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vfncvt.f.f.w v12, v16, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB8_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
@@ -207,9 +194,9 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfncvt.f.f.w v24, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfncvt.f.f.w v8, v16, v0.t
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index d6caad15e40a2c..36dadb5f1323e1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -229,7 +229,6 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -238,21 +237,22 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsqrt.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB10_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB10_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vfsqrt.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -526,10 +526,10 @@ declare <vscale x 16 x double> @llvm.vp.sqrt.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfsqrt_vv_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
@@ -542,7 +542,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfsqrt.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.sqrt.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 2eae18d7cc4937..c8b894cbdbb163 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -515,7 +515,6 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -524,6 +523,7 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -621,12 +621,11 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
+; ZVFHMIN-NEXT:    vfmv.v.f v16, fa5
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
@@ -635,22 +634,23 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v3, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v24, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
index ca0bbfd65ca29c..ceceff23c868f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
@@ -710,17 +710,31 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_commute(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v24, v8
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmv.v.f v8, fa5
+; ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfnmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
index 2797ca2eb31630..bfef07b1b44da9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
@@ -682,17 +682,31 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_commute(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v24, v8
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmv.v.f v8, fa5
+; ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfnmsub.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmv.v.v v8, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 15cb42bacf1735..a8cf307e6fadd2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -49,8 +49,8 @@ define i64 @test_vleff_nxv8i8_mask(<vscale x 8 x i8> %maskedoff, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -108,8 +108,8 @@ define i64 @test_vlseg2ff_nxv8i8_mask(<vscale x 8 x i8> %val, ptr %base, <vscale
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vr = COPY $v8
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vrn2m1nov0 = REG_SEQUENCE [[COPY3]], %subreg.sub_vrm1_0, [[COPY3]], %subreg.sub_vrm1_1
-  ; CHECK-NEXT:   $v0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_MASK:%[0-9]+]]:vrn2m1nov0, [[PseudoVLSEG2E8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1_MASK [[REG_SEQUENCE]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_MASK:%[0-9]+]]:vrn2m1nov0, [[PseudoVLSEG2E8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1_MASK [[REG_SEQUENCE]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index a35fc874065a70..05cbee2caad0f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -408,7 +408,7 @@ declare <vscale x 128 x i8> @llvm.vp.smax.nxv128i8(<vscale x 128 x i8>, <vscale
 define <vscale x 128 x i8> @vmax_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vx_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -424,7 +424,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %b, i32 0
@@ -970,10 +970,10 @@ declare <vscale x 32 x i32> @llvm.vp.smax.nxv32i32(<vscale x 32 x i32>, <vscale
 define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vx_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -987,7 +987,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB80_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
@@ -1030,10 +1030,10 @@ declare i32 @llvm.vscale.i32()
 define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m) {
 ; CHECK-LABEL: vmax_vx_nxv32i32_evl_nx8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -1047,7 +1047,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB82_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 1f620a44dbbc89..c9ebae3d46db01 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -410,7 +410,7 @@ declare <vscale x 128 x i8> @llvm.vp.umax.nxv128i8(<vscale x 128 x i8>, <vscale
 define <vscale x 128 x i8> @vmaxu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmaxu_vx_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -426,7 +426,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %b, i32 0
@@ -972,10 +972,10 @@ declare <vscale x 32 x i32> @llvm.vp.umax.nxv32i32(<vscale x 32 x i32>, <vscale
 define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmaxu_vx_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -989,7 +989,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB80_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
@@ -1032,10 +1032,10 @@ declare i32 @llvm.vscale.i32()
 define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m) {
 ; CHECK-LABEL: vmaxu_vx_nxv32i32_evl_nx8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -1049,7 +1049,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB82_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
index ffeb399291e1f9..e7184921d87a08 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1f16(
 define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2f16(
 define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4f16(
 define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8f16(
 define <vscale x 8 x i1> @intrinsic_vmfeq_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v10
+; CHECK-NEXT:    vmfeq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfeq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfeq.mask.nxv16f16(
 define <vscale x 16 x i1> @intrinsic_vmfeq_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmfeq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v12
+; CHECK-NEXT:    vmfeq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfeq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1f32(
 define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2f32(
 define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4f32(
 define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v10
+; CHECK-NEXT:    vmfeq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfeq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8f32(
 define <vscale x 8 x i1> @intrinsic_vmfeq_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmfeq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v12
+; CHECK-NEXT:    vmfeq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfeq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1f64(
 define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2f64(
 define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v10
+; CHECK-NEXT:    vmfeq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfeq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4f64(
 define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmfeq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v12
+; CHECK-NEXT:    vmfeq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfeq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
index 993b50a1c81ce8..a6dad9eaa4f358 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1f16(
 define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2f16(
 define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4f16(
 define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8f16(
 define <vscale x 8 x i1> @intrinsic_vmfge_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmfle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v10, v8
+; CHECK-NEXT:    vmfle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16f16(
 define <vscale x 16 x i1> @intrinsic_vmfge_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmfle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v12, v8
+; CHECK-NEXT:    vmfle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1f32(
 define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2f32(
 define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4f32(
 define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmfle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v10, v8
+; CHECK-NEXT:    vmfle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8f32(
 define <vscale x 8 x i1> @intrinsic_vmfge_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmfle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v12, v8
+; CHECK-NEXT:    vmfle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1f64(
 define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2f64(
 define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmfle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v10, v8
+; CHECK-NEXT:    vmfle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4f64(
 define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmfle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v12, v8
+; CHECK-NEXT:    vmfle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
index 427f0eb28e7df5..f643a4036381c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1f16(
 define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f16(
 define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4f16(
 define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8f16(
 define <vscale x 8 x i1> @intrinsic_vmfgt_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmflt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v10, v8
+; CHECK-NEXT:    vmflt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16f16(
 define <vscale x 16 x i1> @intrinsic_vmfgt_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmflt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v12, v8
+; CHECK-NEXT:    vmflt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmflt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1f32(
 define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f32(
 define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4f32(
 define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v10, v8
+; CHECK-NEXT:    vmflt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8f32(
 define <vscale x 8 x i1> @intrinsic_vmfgt_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v12, v8
+; CHECK-NEXT:    vmflt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmflt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1f64(
 define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64(
 define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v10, v8
+; CHECK-NEXT:    vmflt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmflt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4f64(
 define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v12, v8
+; CHECK-NEXT:    vmflt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmflt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
index e5327632fc04f6..6c52364c1fbd56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1f16(
 define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2f16(
 define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4f16(
 define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8f16(
 define <vscale x 8 x i1> @intrinsic_vmfle_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmfle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v8, v10
+; CHECK-NEXT:    vmfle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16f16(
 define <vscale x 16 x i1> @intrinsic_vmfle_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmfle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1f32(
 define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2f32(
 define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4f32(
 define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmfle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v8, v10
+; CHECK-NEXT:    vmfle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8f32(
 define <vscale x 8 x i1> @intrinsic_vmfle_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmfle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1f64(
 define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2f64(
 define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmfle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v8, v10
+; CHECK-NEXT:    vmfle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4f64(
 define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmfle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfle.vv v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
index 64f257e355ceae..37a9c6b081a1df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1f16(
 define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2f16(
 define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4f16(
 define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8f16(
 define <vscale x 8 x i1> @intrinsic_vmflt_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmflt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v8, v10
+; CHECK-NEXT:    vmflt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16f16(
 define <vscale x 16 x i1> @intrinsic_vmflt_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmflt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v8, v12
+; CHECK-NEXT:    vmflt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmflt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1f32(
 define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2f32(
 define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4f32(
 define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmflt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v8, v10
+; CHECK-NEXT:    vmflt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8f32(
 define <vscale x 8 x i1> @intrinsic_vmflt_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmflt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v8, v12
+; CHECK-NEXT:    vmflt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmflt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1f64(
 define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2f64(
 define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmflt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v8, v10
+; CHECK-NEXT:    vmflt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmflt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4f64(
 define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmflt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmflt.vv v0, v8, v12
+; CHECK-NEXT:    vmflt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmflt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
index 6f6a2a5e8783c6..5defce42091e55 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1f16(
 define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2f16(
 define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4f16(
 define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8f16(
 define <vscale x 8 x i1> @intrinsic_vmfne_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmfne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfne.vv v0, v8, v10
+; CHECK-NEXT:    vmfne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16f16(
 define <vscale x 16 x i1> @intrinsic_vmfne_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmfne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfne.vv v0, v8, v12
+; CHECK-NEXT:    vmfne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1f32(
 define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2f32(
 define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4f32(
 define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmfne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfne.vv v0, v8, v10
+; CHECK-NEXT:    vmfne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8f32(
 define <vscale x 8 x i1> @intrinsic_vmfne_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmfne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfne.vv v0, v8, v12
+; CHECK-NEXT:    vmfne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1f64(
 define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2f64(
 define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmfne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfne.vv v0, v8, v10
+; CHECK-NEXT:    vmfne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmfne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4f64(
 define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmfne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmfne.vv v0, v8, v12
+; CHECK-NEXT:    vmfne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmfne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 8fabf93356aebb..34ee1cc4a0123c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -408,7 +408,7 @@ declare <vscale x 128 x i8> @llvm.vp.smin.nxv128i8(<vscale x 128 x i8>, <vscale
 define <vscale x 128 x i8> @vmin_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vx_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -424,7 +424,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %b, i32 0
@@ -970,10 +970,10 @@ declare <vscale x 32 x i32> @llvm.vp.smin.nxv32i32(<vscale x 32 x i32>, <vscale
 define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vx_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -987,7 +987,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB80_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
@@ -1030,10 +1030,10 @@ declare i32 @llvm.vscale.i32()
 define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m) {
 ; CHECK-LABEL: vmin_vx_nxv32i32_evl_nx8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -1047,7 +1047,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB82_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 8ec85e545a0f8e..0e69992ad8791c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -410,7 +410,7 @@ declare <vscale x 128 x i8> @llvm.vp.umin.nxv128i8(<vscale x 128 x i8>, <vscale
 define <vscale x 128 x i8> @vminu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vminu_vx_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -426,7 +426,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %b, i32 0
@@ -972,10 +972,10 @@ declare <vscale x 32 x i32> @llvm.vp.umin.nxv32i32(<vscale x 32 x i32>, <vscale
 define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vminu_vx_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -989,7 +989,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB80_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
@@ -1032,10 +1032,10 @@ declare i32 @llvm.vscale.i32()
 define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m) {
 ; CHECK-LABEL: vminu_vx_nxv32i32_evl_nx8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    sub a3, a1, a2
@@ -1049,7 +1049,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB82_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
index da1c751b566304..e2f53818606c74 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmseq.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmseq.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmseq_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmseq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v10
+; CHECK-NEXT:    vmseq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmseq.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmseq.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmseq_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmseq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v12
+; CHECK-NEXT:    vmseq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmseq.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmseq.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmseq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v10
+; CHECK-NEXT:    vmseq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmseq.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmseq.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmseq_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmseq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v12
+; CHECK-NEXT:    vmseq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmseq.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmseq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v10
+; CHECK-NEXT:    vmseq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmseq.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmseq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v12
+; CHECK-NEXT:    vmseq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmseq.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmseq.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v10
+; CHECK-NEXT:    vmseq.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmseq.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmseq.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmseq.vv v0, v8, v12
+; CHECK-NEXT:    vmseq.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmseq_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmseq.vv v10, v8, v11, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmseq.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
index b7a676e7f2dd37..bd64093fd072d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgt.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgt.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsgt_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v10, v8
+; CHECK-NEXT:    vmslt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsgt.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsgt.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsgt_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v12, v8
+; CHECK-NEXT:    vmslt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsgt.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgt.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v10, v8
+; CHECK-NEXT:    vmslt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsgt.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgt.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsgt_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v12, v8
+; CHECK-NEXT:    vmslt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsgt.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v10, v8
+; CHECK-NEXT:    vmslt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsgt.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgt.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v12, v8
+; CHECK-NEXT:    vmslt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsgt.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v10, v8
+; CHECK-NEXT:    vmslt.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsgt.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v12, v8
+; CHECK-NEXT:    vmslt.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsgt.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmslt.vv v10, v11, v8, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmslt.vv v11, v10, v8, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
index 88a632de067a68..37a022ef789a84 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgtu.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgtu.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsgtu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v10, v8
+; CHECK-NEXT:    vmsltu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsgtu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsgtu.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsgtu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v12, v8
+; CHECK-NEXT:    vmsltu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsgtu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgtu.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v10, v8
+; CHECK-NEXT:    vmsltu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsgtu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgtu.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsgtu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v12, v8
+; CHECK-NEXT:    vmsltu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsgtu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v10, v8
+; CHECK-NEXT:    vmsltu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsgtu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgtu.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v12, v8
+; CHECK-NEXT:    vmsltu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsgtu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v10, v8
+; CHECK-NEXT:    vmsltu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsgtu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v12, v8
+; CHECK-NEXT:    vmsltu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsgtu.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsltu.vv v10, v11, v8, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsltu.vv v11, v10, v8, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
index 2248ba03adfe79..cac4cbe0b2fe82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsle.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsle.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsle_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v10
+; CHECK-NEXT:    vmsle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsle.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsle.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsle_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v12
+; CHECK-NEXT:    vmsle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsle.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsle.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v10
+; CHECK-NEXT:    vmsle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsle.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsle.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsle_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v12
+; CHECK-NEXT:    vmsle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsle.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v10
+; CHECK-NEXT:    vmsle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsle.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v12
+; CHECK-NEXT:    vmsle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsle.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v10
+; CHECK-NEXT:    vmsle.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsle.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v8, v12
+; CHECK-NEXT:    vmsle.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmsle_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsle.vv v10, v8, v11, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsle.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
index 57bae83b25e0e5..75dc38f1333d2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsleu.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsleu.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsleu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v10
+; CHECK-NEXT:    vmsleu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsleu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsleu.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsleu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v12
+; CHECK-NEXT:    vmsleu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsleu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsleu.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v10
+; CHECK-NEXT:    vmsleu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsleu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsleu.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsleu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v12
+; CHECK-NEXT:    vmsleu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsleu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v10
+; CHECK-NEXT:    vmsleu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsleu.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v12
+; CHECK-NEXT:    vmsleu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsleu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v10
+; CHECK-NEXT:    vmsleu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsleu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v8, v12
+; CHECK-NEXT:    vmsleu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsleu.vv v10, v8, v11, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsleu.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
index 6783f7feb624c5..2efbe466149ba8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmslt_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v10
+; CHECK-NEXT:    vmslt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmslt.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmslt.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmslt_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v12
+; CHECK-NEXT:    vmslt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmslt.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v10
+; CHECK-NEXT:    vmslt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmslt.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmslt_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v12
+; CHECK-NEXT:    vmslt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmslt.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v10
+; CHECK-NEXT:    vmslt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmslt.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v12
+; CHECK-NEXT:    vmslt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmslt.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmslt.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v10
+; CHECK-NEXT:    vmslt.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmslt.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmslt.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmslt.vv v0, v8, v12
+; CHECK-NEXT:    vmslt.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmslt.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmslt_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmslt.vv v10, v8, v11, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmslt.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
index b082b735a02072..c344dff766bdbc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v10
+; CHECK-NEXT:    vmsltu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsltu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsltu.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsltu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v12
+; CHECK-NEXT:    vmsltu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsltu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v10
+; CHECK-NEXT:    vmsltu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsltu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v12
+; CHECK-NEXT:    vmsltu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsltu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v10
+; CHECK-NEXT:    vmsltu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsltu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v12
+; CHECK-NEXT:    vmsltu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsltu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v10
+; CHECK-NEXT:    vmsltu.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsltu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsltu.vv v0, v8, v12
+; CHECK-NEXT:    vmsltu.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsltu.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsltu.vv v10, v8, v11, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsltu.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
index bb4575e5d72cbe..0c34dd73e0e803 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsne.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsne.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsne_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmsne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsne.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsne.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsne_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v12
+; CHECK-NEXT:    vmsne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsne.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsne.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmsne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsne.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsne.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsne_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v12
+; CHECK-NEXT:    vmsne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsne.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmsne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsne.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v12
+; CHECK-NEXT:    vmsne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsne.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsne.vv v14, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmsne.vv v14, v10, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsne.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsne.vv v20, v8, v12
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsne.vv v0, v8, v12
+; CHECK-NEXT:    vmsne.vv v20, v12, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i64(
@@ -1692,11 +1674,11 @@ define <vscale x 1 x i1> @intrinsic_vmsne_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsne.vv v10, v8, v11, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsne.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index 8b1660283cb7d1..d0f2ce1ca80045 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -450,14 +450,14 @@ define <vscale x 64 x i8> @test_vp_reverse_nxv64i8_masked(<vscale x 64 x i8> %sr
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vrsub.vx v24, v16, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v20, v8, v24
-; CHECK-NEXT:    vrgatherei16.vv v16, v12, v24
+; CHECK-NEXT:    vrgatherei16.vv v28, v8, v16
+; CHECK-NEXT:    vrgatherei16.vv v24, v12, v16
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a1, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a1, v0.t
+; CHECK-NEXT:    vslidedown.vx v8, v24, a1, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 64 x i8> @llvm.experimental.vp.reverse.nxv64i8(<vscale x 64 x i8> %src, <vscale x 64 x i1> %mask, i32 %evl)
   ret <vscale x 64 x i8> %dst
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
index a30ebf2d33b502..9b99cfe39b574d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
@@ -7,15 +7,15 @@ define <2 x i1> @test_vp_reverse_v2i1_masked(<2 x i1> %src, <2 x i1> %mask, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vid.v v9, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v11, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v9, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <2 x i1> @llvm.experimental.vp.reverse.v2i1(<2 x i1> %src, <2 x i1> %mask, i32 %evl)
   ret <2 x i1> %dst
@@ -44,15 +44,15 @@ define <4 x i1> @test_vp_reverse_v4i1_masked(<4 x i1> %src, <4 x i1> %mask, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vid.v v9, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v11, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v9, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <4 x i1> @llvm.experimental.vp.reverse.v4i1(<4 x i1> %src, <4 x i1> %mask, i32 %evl)
   ret <4 x i1> %dst
@@ -81,15 +81,15 @@ define <8 x i1> @test_vp_reverse_v8i1_masked(<8 x i1> %src, <8 x i1> %mask, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vid.v v9, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v11, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v9, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <8 x i1> @llvm.experimental.vp.reverse.v8i1(<8 x i1> %src, <8 x i1> %mask, i32 %evl)
   ret <8 x i1> %dst
@@ -118,15 +118,15 @@ define <16 x i1> @test_vp_reverse_v16i1_masked(<16 x i1> %src, <16 x i1> %mask,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v12, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vid.v v10, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v12, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v9, v12, v10, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <16 x i1> @llvm.experimental.vp.reverse.v16i1(<16 x i1> %src, <16 x i1> %mask, i32 %evl)
   ret <16 x i1> %dst
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
index ceb6a164e20df6..c0bba31c2086a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
@@ -6,15 +6,15 @@ define <vscale x 1 x i1> @test_vp_reverse_nxv1i1_masked(<vscale x 1 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vid.v v9, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v11, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v9, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 1 x i1> @llvm.experimental.vp.reverse.nxv1i1(<vscale x 1 x i1> %src, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i1> %dst
@@ -43,15 +43,15 @@ define <vscale x 2 x i1> @test_vp_reverse_nxv2i1_masked(<vscale x 2 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vid.v v9, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v11, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v9, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1> %src, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i1> %dst
@@ -80,15 +80,15 @@ define <vscale x 4 x i1> @test_vp_reverse_nxv4i1_masked(<vscale x 4 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vid.v v9, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v11, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v9, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> %src, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i1> %dst
@@ -117,15 +117,15 @@ define <vscale x 8 x i1> @test_vp_reverse_nxv8i1_masked(<vscale x 8 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v12, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vid.v v10, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v12, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v9, v12, v10, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 8 x i1> @llvm.experimental.vp.reverse.nxv8i1(<vscale x 8 x i1> %src, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i1> %dst
@@ -154,15 +154,15 @@ define <vscale x 16 x i1> @test_vp_reverse_nxv16i1_masked(<vscale x 16 x i1> %sr
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vid.v v12, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v10, v12, v0.t
-; CHECK-NEXT:    vmsne.vi v8, v16, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v10, v16, v12, v0.t
+; CHECK-NEXT:    vmsne.vi v8, v10, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 16 x i1> @llvm.experimental.vp.reverse.nxv16i1(<vscale x 16 x i1> %src, <vscale x 16 x i1> %mask, i32 %evl)
@@ -192,15 +192,15 @@ define <vscale x 32 x i1> @test_vp_reverse_nxv32i1_masked(<vscale x 32 x i1> %sr
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v24, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vid.v v16, v0.t
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v12, v16, v0.t
-; CHECK-NEXT:    vmsne.vi v8, v24, 0, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v12, v24, v16, v0.t
+; CHECK-NEXT:    vmsne.vi v8, v12, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 32 x i1> @llvm.experimental.vp.reverse.nxv32i1(<vscale x 32 x i1> %src, <vscale x 32 x i1> %mask, i32 %evl)
@@ -228,25 +228,26 @@ define <vscale x 32 x i1> @test_vp_reverse_nxv32i1(<vscale x 32 x i1> %src, i32
 define <vscale x 64 x i1> @test_vp_reverse_nxv64i1_masked(<vscale x 64 x i1> %src, <vscale x 64 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv64i1_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v7, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a2, a1, 2
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vrsub.vx v0, v16, a2
+; CHECK-NEXT:    vrsub.vx v24, v16, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v20, v24, v0
-; CHECK-NEXT:    vrgatherei16.vv v16, v28, v0
+; CHECK-NEXT:    vrgatherei16.vv v20, v8, v24
+; CHECK-NEXT:    vrgatherei16.vv v16, v12, v24
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a1, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vslidedown.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vmsne.vi v8, v16, 0, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vslidedown.vx v8, v16, a1, v0.t
+; CHECK-NEXT:    vmsne.vi v16, v8, 0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
   %dst = call <vscale x 64 x i1> @llvm.experimental.vp.reverse.nxv64i1(<vscale x 64 x i1> %src, <vscale x 64 x i1> %mask, i32 %evl)
   ret <vscale x 64 x i1> %dst
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
index ce0ae2022885ac..b0ea1fa5911598 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
@@ -10,15 +10,15 @@ declare <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1>, <16 x i1>, i32,
 define <2 x i1> @test_vp_splice_v2i1(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v2i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -34,15 +34,15 @@ define <2 x i1> @test_vp_splice_v2i1(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %ev
 define <2 x i1> @test_vp_splice_v2i1_negative_offset(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v2i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -58,23 +58,23 @@ define <2 x i1> @test_vp_splice_v2i1_negative_offset(<2 x i1> %va, <2 x i1> %vb,
 define <2 x i1> @test_vp_splice_v2i1_masked(<2 x i1> %va, <2 x i1> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v2i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb)
   ret <2 x i1> %v
@@ -83,15 +83,15 @@ define <2 x i1> @test_vp_splice_v2i1_masked(<2 x i1> %va, <2 x i1> %vb, <2 x i1>
 define <4 x i1> @test_vp_splice_v4i1(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v4i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -107,15 +107,15 @@ define <4 x i1> @test_vp_splice_v4i1(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %ev
 define <4 x i1> @test_vp_splice_v4i1_negative_offset(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v4i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -131,23 +131,23 @@ define <4 x i1> @test_vp_splice_v4i1_negative_offset(<4 x i1> %va, <4 x i1> %vb,
 define <4 x i1> @test_vp_splice_v4i1_masked(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v4i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb)
   ret <4 x i1> %v
@@ -156,15 +156,15 @@ define <4 x i1> @test_vp_splice_v4i1_masked(<4 x i1> %va, <4 x i1> %vb, <4 x i1>
 define <8 x i1> @test_vp_splice_v8i1(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v8i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -180,15 +180,15 @@ define <8 x i1> @test_vp_splice_v8i1(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %ev
 define <8 x i1> @test_vp_splice_v8i1_negative_offset(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v8i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -204,23 +204,23 @@ define <8 x i1> @test_vp_splice_v8i1_negative_offset(<8 x i1> %va, <8 x i1> %vb,
 define <8 x i1> @test_vp_splice_v8i1_masked(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v8i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb)
   ret <8 x i1> %v
@@ -229,15 +229,15 @@ define <8 x i1> @test_vp_splice_v8i1_masked(<8 x i1> %va, <8 x i1> %vb, <8 x i1>
 define <16 x i1> @test_vp_splice_v16i1(<16 x i1> %va, <16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v16i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -253,15 +253,15 @@ define <16 x i1> @test_vp_splice_v16i1(<16 x i1> %va, <16 x i1> %vb, i32 zeroext
 define <16 x i1> @test_vp_splice_v16i1_negative_offset(<16 x i1> %va, <16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v16i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -277,23 +277,23 @@ define <16 x i1> @test_vp_splice_v16i1_negative_offset(<16 x i1> %va, <16 x i1>
 define <16 x i1> @test_vp_splice_v16i1_masked(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v16i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1> %va, <16 x i1> %vb, i32 5, <16 x i1> %mask, i32 %evla, i32 %evlb)
   ret <16 x i1> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
index 668cff23429366..0576255fe3503c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
@@ -13,15 +13,15 @@ declare <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i
 define <vscale x 1 x i1> @test_vp_splice_nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv1i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -37,15 +37,15 @@ define <vscale x 1 x i1> @test_vp_splice_nxv1i1(<vscale x 1 x i1> %va, <vscale x
 define <vscale x 1 x i1> @test_vp_splice_nxv1i1_negative_offset(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv1i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -61,23 +61,23 @@ define <vscale x 1 x i1> @test_vp_splice_nxv1i1_negative_offset(<vscale x 1 x i1
 define <vscale x 1 x i1> @test_vp_splice_nxv1i1_masked(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv1i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb)
   ret <vscale x 1 x i1> %v
@@ -86,15 +86,15 @@ define <vscale x 1 x i1> @test_vp_splice_nxv1i1_masked(<vscale x 1 x i1> %va, <v
 define <vscale x 2 x i1> @test_vp_splice_nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv2i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -110,15 +110,15 @@ define <vscale x 2 x i1> @test_vp_splice_nxv2i1(<vscale x 2 x i1> %va, <vscale x
 define <vscale x 2 x i1> @test_vp_splice_nxv2i1_negative_offset(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv2i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -134,23 +134,23 @@ define <vscale x 2 x i1> @test_vp_splice_nxv2i1_negative_offset(<vscale x 2 x i1
 define <vscale x 2 x i1> @test_vp_splice_nxv2i1_masked(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv2i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
   ret <vscale x 2 x i1> %v
@@ -159,15 +159,15 @@ define <vscale x 2 x i1> @test_vp_splice_nxv2i1_masked(<vscale x 2 x i1> %va, <v
 define <vscale x 4 x i1> @test_vp_splice_nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv4i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -183,15 +183,15 @@ define <vscale x 4 x i1> @test_vp_splice_nxv4i1(<vscale x 4 x i1> %va, <vscale x
 define <vscale x 4 x i1> @test_vp_splice_nxv4i1_negative_offset(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv4i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -207,23 +207,23 @@ define <vscale x 4 x i1> @test_vp_splice_nxv4i1_negative_offset(<vscale x 4 x i1
 define <vscale x 4 x i1> @test_vp_splice_nxv4i1_masked(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, <vscale x 4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv4i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 5, <vscale x 4 x i1> %mask, i32 %evla, i32 %evlb)
   ret <vscale x 4 x i1> %v
@@ -232,15 +232,15 @@ define <vscale x 4 x i1> @test_vp_splice_nxv4i1_masked(<vscale x 4 x i1> %va, <v
 define <vscale x 8 x i1> @test_vp_splice_nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv8i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v9, 5
@@ -256,15 +256,15 @@ define <vscale x 8 x i1> @test_vp_splice_nxv8i1(<vscale x 8 x i1> %va, <vscale x
 define <vscale x 8 x i1> @test_vp_splice_nxv8i1_negative_offset(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv8i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v9, v9, a0
@@ -280,23 +280,23 @@ define <vscale x 8 x i1> @test_vp_splice_nxv8i1_negative_offset(<vscale x 8 x i1
 define <vscale x 8 x i1> @test_vp_splice_nxv8i1_masked(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, <vscale x 8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv8i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v11, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v9, v10, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT:    vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 5, <vscale x 8 x i1> %mask, i32 %evla, i32 %evlb)
   ret <vscale x 8 x i1> %v
@@ -305,21 +305,21 @@ define <vscale x 8 x i1> @test_vp_splice_nxv8i1_masked(<vscale x 8 x i1> %va, <v
 define <vscale x 16 x i1> @test_vp_splice_nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv16i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vslidedown.vi v10, v10, 5
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a0
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vslideup.vx v10, v8, a0
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    ret
 
   %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 5, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
@@ -329,21 +329,21 @@ define <vscale x 16 x i1> @test_vp_splice_nxv16i1(<vscale x 16 x i1> %va, <vscal
 define <vscale x 16 x i1> @test_vp_splice_nxv16i1_negative_offset(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv16i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslidedown.vx v10, v10, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 5
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    ret
 
   %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 -5, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
@@ -353,15 +353,15 @@ define <vscale x 16 x i1> @test_vp_splice_nxv16i1_negative_offset(<vscale x 16 x
 define <vscale x 16 x i1> @test_vp_splice_nxv16i1_masked(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, <vscale x 16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv16i1_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v14, 0
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v14
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -379,21 +379,21 @@ define <vscale x 16 x i1> @test_vp_splice_nxv16i1_masked(<vscale x 16 x i1> %va,
 define <vscale x 32 x i1> @test_vp_splice_nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv32i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vslidedown.vi v12, v12, 5
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v12, a0
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vslideup.vx v12, v8, a0
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    ret
 
   %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 5, <vscale x 32 x i1> splat (i1 1), i32 %evla, i32 %evlb)
@@ -403,21 +403,21 @@ define <vscale x 32 x i1> @test_vp_splice_nxv32i1(<vscale x 32 x i1> %va, <vscal
 define <vscale x 32 x i1> @test_vp_splice_nxv32i1_negative_offset(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv32i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslidedown.vx v12, v12, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 5
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vslideup.vi v12, v8, 5
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    ret
 
   %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 -5, <vscale x 32 x i1> splat (i1 1), i32 %evla, i32 %evlb)
@@ -431,19 +431,19 @@ define <vscale x 32 x i1> @test_vp_splice_nxv32i1_masked(<vscale x 32 x i1> %va,
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v16, v16, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v12, v12, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    vslideup.vx v16, v12, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v12, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v16, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v8, v12, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 5, <vscale x 32 x i1> %mask, i32 %evla, i32 %evlb)
@@ -453,21 +453,21 @@ define <vscale x 32 x i1> @test_vp_splice_nxv32i1_masked(<vscale x 32 x i1> %va,
 define <vscale x 64 x i1> @test_vp_splice_nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv64i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v24, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vslidedown.vi v16, v16, 5
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a0
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vslideup.vx v16, v8, a0
+; CHECK-NEXT:    vmsne.vi v0, v16, 0
 ; CHECK-NEXT:    ret
 
   %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 5, <vscale x 64 x i1> splat (i1 1), i32 %evla, i32 %evlb)
@@ -477,21 +477,21 @@ define <vscale x 64 x i1> @test_vp_splice_nxv64i1(<vscale x 64 x i1> %va, <vscal
 define <vscale x 64 x i1> @test_vp_splice_nxv64i1_negative_offset(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_nxv64i1_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v24, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetivli zero, 5, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslidedown.vx v16, v16, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 5
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vslideup.vi v16, v8, 5
+; CHECK-NEXT:    vmsne.vi v0, v16, 0
 ; CHECK-NEXT:    ret
 
   %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 -5, <vscale x 64 x i1> splat (i1 1), i32 %evla, i32 %evlb)
@@ -505,19 +505,19 @@ define <vscale x 64 x i1> @test_vp_splice_nxv64i1_masked(<vscale x 64 x i1> %va,
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vslidedown.vi v24, v24, 5, v0.t
+; CHECK-NEXT:    vslidedown.vi v16, v16, 5, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    vslideup.vx v24, v16, a0, v0.t
+; CHECK-NEXT:    vslideup.vx v16, v24, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v24, 0, v0.t
+; CHECK-NEXT:    vmsne.vi v8, v16, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
   %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 5, <vscale x 64 x i1> %mask, i32 %evla, i32 %evlb)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index c86fee6305931f..010b3a9159bc91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -254,7 +254,6 @@ declare <vscale x 32 x i8> @llvm.vp.gather.nxv32i8.nxv32p0(<vscale x 32 x ptr>,
 define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8> %idxs, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_nxv32i8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a2, a3, 1
 ; RV32-NEXT:    sub a4, a1, a2
@@ -263,6 +262,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV32-NEXT:    and a4, a5, a4
 ; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf4 v16, v10
@@ -281,7 +281,6 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ;
 ; RV64-LABEL: vpgather_baseidx_nxv32i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a3, a2, 1
 ; RV64-NEXT:    sub a4, a1, a3
@@ -294,22 +293,24 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    and a6, a6, a4
 ; RV64-NEXT:    srli a4, a2, 2
 ; RV64-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vslidedown.vx v13, v0, a4
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v14, v13
 ; RV64-NEXT:    vslidedown.vx v0, v13, a4
 ; RV64-NEXT:    vsetvli a7, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v11
-; RV64-NEXT:    vsetvli zero, a6, e8, m1, ta, ma
-; RV64-NEXT:    vluxei64.v v11, (a0), v16, v0.t
 ; RV64-NEXT:    bltu a5, a2, .LBB12_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a5, a2
 ; RV64-NEXT:  .LBB12_2:
+; RV64-NEXT:    vsetvli zero, a6, e8, m1, ta, ma
+; RV64-NEXT:    vluxei64.v v11, (a0), v16, v0.t
 ; RV64-NEXT:    vsetvli a6, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v10
 ; RV64-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v13
+; RV64-NEXT:    vmv1r.v v0, v14
 ; RV64-NEXT:    vluxei64.v v10, (a0), v16, v0.t
 ; RV64-NEXT:    bltu a1, a3, .LBB12_4
 ; RV64-NEXT:  # %bb.3:
@@ -2264,7 +2265,6 @@ declare <vscale x 16 x double> @llvm.vp.gather.nxv16f64.nxv16p0(<vscale x 16 x p
 define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    sltu a3, a0, a2
@@ -2272,6 +2272,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    srli a3, a1, 3
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (zero), v12, v0.t
@@ -2287,7 +2288,6 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ;
 ; RV64-LABEL: vpgather_nxv16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    sub a2, a0, a1
 ; RV64-NEXT:    sltu a3, a0, a2
@@ -2295,6 +2295,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    srli a3, a1, 3
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (zero), v16, v0.t
@@ -2313,7 +2314,6 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf2 v16, v8
 ; RV32-NEXT:    vsll.vi v24, v16, 3
@@ -2324,6 +2324,7 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV32-NEXT:    and a3, a4, a3
 ; RV32-NEXT:    srli a4, a2, 3
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v8, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v28, v0.t
@@ -2332,18 +2333,17 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB103_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v12
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v8
 ; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    vsext.vf4 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsll.vi v8, v16, 3
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    sub a3, a1, a2
 ; RV64-NEXT:    sltu a4, a1, a3
@@ -2351,15 +2351,16 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v7, v0
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
+; RV64-NEXT:    vluxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    bltu a1, a2, .LBB103_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB103_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    ret
   %ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i16> %idxs
@@ -2370,7 +2371,6 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf2 v16, v8
 ; RV32-NEXT:    vsll.vi v24, v16, 3
@@ -2381,6 +2381,7 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    and a3, a4, a3
 ; RV32-NEXT:    srli a4, a2, 3
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v8, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v28, v0.t
@@ -2389,17 +2390,16 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB104_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v12
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v10
 ; RV64-NEXT:    vsext.vf4 v24, v8
-; RV64-NEXT:    vsll.vi v24, v24, 3
+; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    sub a3, a1, a2
@@ -2408,6 +2408,7 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
@@ -2416,8 +2417,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB104_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v12
-; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
 ; RV64-NEXT:    ret
   %eidxs = sext <vscale x 16 x i16> %idxs to <vscale x 16 x i64>
   %ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i64> %eidxs
@@ -2428,7 +2429,6 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vzext.vf2 v16, v8
 ; RV32-NEXT:    vsll.vi v24, v16, 3
@@ -2439,6 +2439,7 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    and a3, a4, a3
 ; RV32-NEXT:    srli a4, a2, 3
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v8, v0
 ; RV32-NEXT:    vslidedown.vx v0, v0, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v28, v0.t
@@ -2447,13 +2448,12 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB105_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v12
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vzext.vf2 v16, v8
 ; RV64-NEXT:    vsll.vi v24, v16, 3
@@ -2464,6 +2464,7 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v8, v0
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v16, (a0), v28, v0.t
@@ -2472,7 +2473,7 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB105_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    ret
   %eidxs = zext <vscale x 16 x i16> %idxs to <vscale x 16 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index f07c16476c56a2..27dbd4ac1b7c88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -437,7 +437,6 @@ declare <vscale x 16 x double> @llvm.vp.load.nxv16f64.p0(ptr, <vscale x 16 x i1>
 define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpload_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
@@ -447,6 +446,7 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    srli a5, a2, 3
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a5
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
@@ -477,37 +477,37 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-LABEL: vpload_nxv17f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a5, a3, 1
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    slli a7, a3, 1
+; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    bltu a2, a5, .LBB38_2
+; CHECK-NEXT:    bltu a2, a7, .LBB38_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a5
+; CHECK-NEXT:    mv a4, a7
 ; CHECK-NEXT:  .LBB38_2:
-; CHECK-NEXT:    sub a6, a4, a3
-; CHECK-NEXT:    sltu a7, a4, a6
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    and a6, a7, a6
-; CHECK-NEXT:    slli a7, a3, 3
-; CHECK-NEXT:    add a7, a0, a7
+; CHECK-NEXT:    sub a5, a4, a3
+; CHECK-NEXT:    sltu a6, a4, a5
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    and a5, a6, a5
+; CHECK-NEXT:    slli a6, a3, 3
+; CHECK-NEXT:    add a6, a0, a6
 ; CHECK-NEXT:    srli t0, a3, 3
 ; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, t0
-; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a7), v0.t
-; CHECK-NEXT:    sub a5, a2, a5
-; CHECK-NEXT:    sltu a2, a2, a5
+; CHECK-NEXT:    sub a7, a2, a7
+; CHECK-NEXT:    sltu a2, a2, a7
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a5
+; CHECK-NEXT:    and a2, a2, a7
+; CHECK-NEXT:    vslidedown.vx v0, v9, t0
 ; CHECK-NEXT:    bltu a2, a3, .LBB38_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB38_4:
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a6), v0.t
 ; CHECK-NEXT:    slli a5, a3, 4
 ; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    srli a6, a3, 2
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a6
+; CHECK-NEXT:    vslidedown.vx v0, v9, a6
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a5), v0.t
 ; CHECK-NEXT:    bltu a4, a3, .LBB38_6
@@ -515,7 +515,7 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:    mv a4, a3
 ; CHECK-NEXT:  .LBB38_6:
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
 ; CHECK-NEXT:    vs1r.v v24, (a1)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index 76efdda15bf77b..5ca62befefada6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -358,11 +358,15 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -372,23 +376,28 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    sub a0, a3, a1
+; CHECK-NEXT:    sltu a4, a3, a0
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    sltu a2, a3, a0
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    bltu a3, a1, .LBB28_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB28_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -399,7 +408,7 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpmerge_vx_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -415,7 +424,7 @@ define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB29_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 128 x i8> poison, i8 %a, i32 0
@@ -427,7 +436,7 @@ define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
 define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpmerge_vi_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -443,7 +452,7 @@ define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb, <vscale
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB30_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vmerge.vim v8, v8, 2, v0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.merge.nxv128i8(<vscale x 128 x i1> %m, <vscale x 128 x i8> splat (i8 2), <vscale x 128 x i8> %vb, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index c12fc0497742ad..4ea104d98de542 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -400,7 +400,6 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 1
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    mv a5, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB31_2
 ; CHECK-NEXT:  # %bb.1:
@@ -411,17 +410,8 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a6, a3
 ; CHECK-NEXT:  .LBB31_4:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 3
-; CHECK-NEXT:    sub sp, sp, a7
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse64.v v8, (a1), v0.t
 ; CHECK-NEXT:    sub a0, a5, a3
 ; CHECK-NEXT:    sltu a5, a5, a0
@@ -431,7 +421,8 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    add a5, a1, a5
 ; CHECK-NEXT:    srli a6, a3, 3
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a6
+; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a6
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    sub a0, a2, a4
 ; CHECK-NEXT:    sltu a2, a2, a0
@@ -446,15 +437,9 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    vslidedown.vx v0, v8, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vse64.v v8, (a1), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vse64.v v24, (a1), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 %evl)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index 4f7cb84c086443..d0549c98e1baa4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -178,7 +178,7 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
 ; ZVFH-NEXT:    csrr a2, vlenb
 ; ZVFH-NEXT:    srli a1, a2, 1
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v24, v0, a1
+; ZVFH-NEXT:    vslidedown.vx v25, v0, a1
 ; ZVFH-NEXT:    slli a2, a2, 2
 ; ZVFH-NEXT:    sub a1, a0, a2
 ; ZVFH-NEXT:    sltu a3, a0, a1
@@ -189,13 +189,13 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
 ; ZVFH-NEXT:    mv a0, a2
 ; ZVFH-NEXT:  .LBB6_2:
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFH-NEXT:    vfmv.s.f v25, fa0
+; ZVFH-NEXT:    vfmv.s.f v24, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfredusum.vs v25, v8, v25, v0.t
+; ZVFH-NEXT:    vfredusum.vs v24, v8, v24, v0.t
 ; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v24
-; ZVFH-NEXT:    vfredusum.vs v25, v16, v25, v0.t
-; ZVFH-NEXT:    vfmv.f.s fa0, v25
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfredusum.vs v24, v16, v24, v0.t
+; ZVFH-NEXT:    vfmv.f.s fa0, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vpreduce_fadd_nxv64f16:
@@ -284,7 +284,7 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
 ; ZVFH-NEXT:    csrr a2, vlenb
 ; ZVFH-NEXT:    srli a1, a2, 1
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v24, v0, a1
+; ZVFH-NEXT:    vslidedown.vx v25, v0, a1
 ; ZVFH-NEXT:    slli a2, a2, 2
 ; ZVFH-NEXT:    sub a1, a0, a2
 ; ZVFH-NEXT:    sltu a3, a0, a1
@@ -295,13 +295,13 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
 ; ZVFH-NEXT:    mv a0, a2
 ; ZVFH-NEXT:  .LBB7_2:
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFH-NEXT:    vfmv.s.f v25, fa0
+; ZVFH-NEXT:    vfmv.s.f v24, fa0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfredosum.vs v25, v8, v25, v0.t
+; ZVFH-NEXT:    vfredosum.vs v24, v8, v24, v0.t
 ; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v24
-; ZVFH-NEXT:    vfredosum.vs v25, v16, v25, v0.t
-; ZVFH-NEXT:    vfmv.f.s fa0, v25
+; ZVFH-NEXT:    vmv1r.v v0, v25
+; ZVFH-NEXT:    vfredosum.vs v24, v16, v24, v0.t
+; ZVFH-NEXT:    vfmv.f.s fa0, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vpreduce_ord_fadd_nxv64f16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 618e169e1f9654..0722beb45e30a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -1151,7 +1151,7 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    srli a2, a3, 2
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vx v24, v0, a2
+; RV32-NEXT:    vslidedown.vx v25, v0, a2
 ; RV32-NEXT:    slli a3, a3, 1
 ; RV32-NEXT:    sub a2, a1, a3
 ; RV32-NEXT:    sltu a4, a1, a2
@@ -1162,13 +1162,13 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:  .LBB67_2:
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v25, a0
+; RV32-NEXT:    vmv.s.x v24, a0
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT:    vredmaxu.vs v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
-; RV32-NEXT:    vredmaxu.vs v25, v16, v25, v0.t
-; RV32-NEXT:    vmv.x.s a0, v25
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vredmaxu.vs v24, v16, v24, v0.t
+; RV32-NEXT:    vmv.x.s a0, v24
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpreduce_umax_nxv32i32:
@@ -1176,25 +1176,25 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
 ; RV64-NEXT:    csrr a3, vlenb
 ; RV64-NEXT:    srli a2, a3, 2
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a2
-; RV64-NEXT:    andi a2, a0, -1
+; RV64-NEXT:    vslidedown.vx v25, v0, a2
+; RV64-NEXT:    andi a0, a0, -1
 ; RV64-NEXT:    slli a3, a3, 1
-; RV64-NEXT:    sub a0, a1, a3
-; RV64-NEXT:    sltu a4, a1, a0
+; RV64-NEXT:    sub a2, a1, a3
+; RV64-NEXT:    sltu a4, a1, a2
 ; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a0, a4, a0
+; RV64-NEXT:    and a2, a4, a2
 ; RV64-NEXT:    bltu a1, a3, .LBB67_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a1, a3
 ; RV64-NEXT:  .LBB67_2:
 ; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT:    vmv.s.x v25, a2
+; RV64-NEXT:    vmv.s.x v24, a0
 ; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV64-NEXT:    vredmaxu.vs v25, v8, v25, v0.t
-; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    vredmaxu.vs v25, v16, v25, v0.t
-; RV64-NEXT:    vmv.x.s a0, v25
+; RV64-NEXT:    vredmaxu.vs v24, v8, v24, v0.t
+; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v25
+; RV64-NEXT:    vredmaxu.vs v24, v16, v24, v0.t
+; RV64-NEXT:    vmv.x.s a0, v24
 ; RV64-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.nxv32i32(i32 %s, <vscale x 32 x i32> %v, <vscale x 32 x i1> %m, i32 %evl)
   ret i32 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index f9ea5143cfcb70..5adf48f18f9996 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -572,7 +572,7 @@ declare <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8>, <vsc
 define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsadd_vi_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -588,7 +588,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> splat (i8 -1), <vscale x 128 x i1> %m, i32 %evl)
@@ -1350,10 +1350,10 @@ declare <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32>, <vsc
 define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsadd_vi_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -1367,7 +1367,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB118_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> splat (i32 -1), <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index 745b93b257085a..7581e3dc1eff51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -571,7 +571,7 @@ declare <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8>, <vsc
 define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsaddu_vi_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -587,7 +587,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> splat (i8 -1), <vscale x 128 x i1> %m, i32 %evl)
@@ -1349,10 +1349,10 @@ declare <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32>, <vsc
 define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsaddu_vi_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -1366,7 +1366,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB118_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> splat (i32 -1), <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index a14ce717261536..97455786b82d60 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -152,45 +152,45 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
 ; NO_FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
 ; NO_FOLDING:       # %bb.0:
 ; NO_FOLDING-NEXT:    vsetvli a3, zero, e32, m1, ta, mu
-; NO_FOLDING-NEXT:    vlm.v v8, (a0)
-; NO_FOLDING-NEXT:    vlm.v v9, (a1)
-; NO_FOLDING-NEXT:    vlm.v v10, (a2)
-; NO_FOLDING-NEXT:    vmv.v.i v11, 0
-; NO_FOLDING-NEXT:    vmv.v.v v0, v8
-; NO_FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
-; NO_FOLDING-NEXT:    vmv.v.v v0, v9
-; NO_FOLDING-NEXT:    vmerge.vim v9, v11, -1, v0
+; NO_FOLDING-NEXT:    vlm.v v0, (a0)
+; NO_FOLDING-NEXT:    vlm.v v10, (a1)
+; NO_FOLDING-NEXT:    vlm.v v12, (a2)
+; NO_FOLDING-NEXT:    vmv.v.i v8, 0
+; NO_FOLDING-NEXT:    vmerge.vim v9, v8, -1, v0
+; NO_FOLDING-NEXT:    vmv.v.v v11, v0
 ; NO_FOLDING-NEXT:    vmv.v.v v0, v10
-; NO_FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
-; NO_FOLDING-NEXT:    vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT:    vmerge.vim v10, v8, -1, v0
+; NO_FOLDING-NEXT:    vmv.v.v v0, v12
+; NO_FOLDING-NEXT:    vmerge.vim v8, v8, -1, v0
+; NO_FOLDING-NEXT:    vmul.vv v10, v9, v10
 ; NO_FOLDING-NEXT:    li a0, 1
-; NO_FOLDING-NEXT:    vsub.vv v11, v12, v10
-; NO_FOLDING-NEXT:    vmv.v.v v0, v8
-; NO_FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
-; NO_FOLDING-NEXT:    vor.vv v8, v9, v10
-; NO_FOLDING-NEXT:    vor.vv v8, v8, v11
+; NO_FOLDING-NEXT:    vsub.vv v9, v9, v8
+; NO_FOLDING-NEXT:    vmv.v.v v0, v11
+; NO_FOLDING-NEXT:    vsub.vx v8, v8, a0, v0.t
+; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
+; NO_FOLDING-NEXT:    vor.vv v8, v8, v9
 ; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetvli a3, zero, e32, m1, ta, mu
-; FOLDING-NEXT:    vlm.v v8, (a0)
-; FOLDING-NEXT:    vlm.v v9, (a1)
-; FOLDING-NEXT:    vlm.v v10, (a2)
-; FOLDING-NEXT:    vmv.v.i v11, 0
-; FOLDING-NEXT:    vmv.v.v v0, v8
-; FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
-; FOLDING-NEXT:    vmv.v.v v0, v9
-; FOLDING-NEXT:    vmerge.vim v9, v11, -1, v0
+; FOLDING-NEXT:    vlm.v v0, (a0)
+; FOLDING-NEXT:    vlm.v v10, (a1)
+; FOLDING-NEXT:    vlm.v v12, (a2)
+; FOLDING-NEXT:    vmv.v.i v8, 0
+; FOLDING-NEXT:    vmerge.vim v9, v8, -1, v0
+; FOLDING-NEXT:    vmv.v.v v11, v0
 ; FOLDING-NEXT:    vmv.v.v v0, v10
-; FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
-; FOLDING-NEXT:    vmul.vv v9, v12, v9
+; FOLDING-NEXT:    vmerge.vim v10, v8, -1, v0
+; FOLDING-NEXT:    vmv.v.v v0, v12
+; FOLDING-NEXT:    vmerge.vim v8, v8, -1, v0
+; FOLDING-NEXT:    vmul.vv v10, v9, v10
 ; FOLDING-NEXT:    li a0, 1
-; FOLDING-NEXT:    vsub.vv v11, v12, v10
-; FOLDING-NEXT:    vmv.v.v v0, v8
-; FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
-; FOLDING-NEXT:    vor.vv v8, v9, v10
-; FOLDING-NEXT:    vor.vv v8, v8, v11
+; FOLDING-NEXT:    vsub.vv v9, v9, v8
+; FOLDING-NEXT:    vmv.v.v v0, v11
+; FOLDING-NEXT:    vsub.vx v8, v8, a0, v0.t
+; FOLDING-NEXT:    vor.vv v8, v10, v8
+; FOLDING-NEXT:    vor.vv v8, v8, v9
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i1>, ptr %x
   %b = load <vscale x 2 x i1>, ptr %y
@@ -210,45 +210,45 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
 ; NO_FOLDING:       # %bb.0:
 ; NO_FOLDING-NEXT:    vsetvli a3, zero, e8, mf4, ta, mu
-; NO_FOLDING-NEXT:    vlm.v v8, (a0)
-; NO_FOLDING-NEXT:    vlm.v v9, (a1)
-; NO_FOLDING-NEXT:    vlm.v v10, (a2)
-; NO_FOLDING-NEXT:    vmv.v.i v11, 0
-; NO_FOLDING-NEXT:    vmv1r.v v0, v8
-; NO_FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
-; NO_FOLDING-NEXT:    vmv1r.v v0, v9
-; NO_FOLDING-NEXT:    vmerge.vim v9, v11, -1, v0
+; NO_FOLDING-NEXT:    vlm.v v0, (a0)
+; NO_FOLDING-NEXT:    vlm.v v10, (a1)
+; NO_FOLDING-NEXT:    vlm.v v12, (a2)
+; NO_FOLDING-NEXT:    vmv.v.i v8, 0
+; NO_FOLDING-NEXT:    vmerge.vim v9, v8, -1, v0
+; NO_FOLDING-NEXT:    vmv1r.v v11, v0
 ; NO_FOLDING-NEXT:    vmv1r.v v0, v10
-; NO_FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
-; NO_FOLDING-NEXT:    vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT:    vmerge.vim v10, v8, -1, v0
+; NO_FOLDING-NEXT:    vmv1r.v v0, v12
+; NO_FOLDING-NEXT:    vmerge.vim v8, v8, -1, v0
+; NO_FOLDING-NEXT:    vmul.vv v10, v9, v10
 ; NO_FOLDING-NEXT:    li a0, 1
-; NO_FOLDING-NEXT:    vsub.vv v11, v12, v10
-; NO_FOLDING-NEXT:    vmv1r.v v0, v8
-; NO_FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
-; NO_FOLDING-NEXT:    vor.vv v8, v9, v10
-; NO_FOLDING-NEXT:    vor.vv v8, v8, v11
+; NO_FOLDING-NEXT:    vsub.vv v9, v9, v8
+; NO_FOLDING-NEXT:    vmv1r.v v0, v11
+; NO_FOLDING-NEXT:    vsub.vx v8, v8, a0, v0.t
+; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
+; NO_FOLDING-NEXT:    vor.vv v8, v8, v9
 ; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetvli a3, zero, e8, mf4, ta, mu
-; FOLDING-NEXT:    vlm.v v8, (a0)
-; FOLDING-NEXT:    vlm.v v9, (a1)
-; FOLDING-NEXT:    vlm.v v10, (a2)
-; FOLDING-NEXT:    vmv.v.i v11, 0
-; FOLDING-NEXT:    vmv1r.v v0, v8
-; FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
-; FOLDING-NEXT:    vmv1r.v v0, v9
-; FOLDING-NEXT:    vmerge.vim v9, v11, -1, v0
+; FOLDING-NEXT:    vlm.v v0, (a0)
+; FOLDING-NEXT:    vlm.v v10, (a1)
+; FOLDING-NEXT:    vlm.v v12, (a2)
+; FOLDING-NEXT:    vmv.v.i v8, 0
+; FOLDING-NEXT:    vmerge.vim v9, v8, -1, v0
+; FOLDING-NEXT:    vmv1r.v v11, v0
 ; FOLDING-NEXT:    vmv1r.v v0, v10
-; FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
-; FOLDING-NEXT:    vmul.vv v9, v12, v9
+; FOLDING-NEXT:    vmerge.vim v10, v8, -1, v0
+; FOLDING-NEXT:    vmv1r.v v0, v12
+; FOLDING-NEXT:    vmerge.vim v8, v8, -1, v0
+; FOLDING-NEXT:    vmul.vv v10, v9, v10
 ; FOLDING-NEXT:    li a0, 1
-; FOLDING-NEXT:    vsub.vv v11, v12, v10
-; FOLDING-NEXT:    vmv1r.v v0, v8
-; FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
-; FOLDING-NEXT:    vor.vv v8, v9, v10
-; FOLDING-NEXT:    vor.vv v8, v8, v11
+; FOLDING-NEXT:    vsub.vv v9, v9, v8
+; FOLDING-NEXT:    vmv1r.v v0, v11
+; FOLDING-NEXT:    vsub.vx v8, v8, a0, v0.t
+; FOLDING-NEXT:    vor.vv v8, v10, v8
+; FOLDING-NEXT:    vor.vv v8, v8, v9
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i1>, ptr %x
   %b = load <vscale x 2 x i1>, ptr %y
@@ -448,16 +448,16 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
 ; NO_FOLDING:       # %bb.0:
 ; NO_FOLDING-NEXT:    vsetvli a3, zero, e32, m1, ta, mu
 ; NO_FOLDING-NEXT:    vlm.v v0, (a0)
-; NO_FOLDING-NEXT:    vlm.v v8, (a2)
-; NO_FOLDING-NEXT:    vlm.v v9, (a1)
-; NO_FOLDING-NEXT:    vmv.v.i v10, 0
-; NO_FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; NO_FOLDING-NEXT:    vmv.v.v v0, v8
-; NO_FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; NO_FOLDING-NEXT:    vadd.vv v10, v11, v8
-; NO_FOLDING-NEXT:    vsub.vv v8, v11, v8
-; NO_FOLDING-NEXT:    vmv.v.v v0, v9
-; NO_FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT:    vlm.v v10, (a2)
+; NO_FOLDING-NEXT:    vlm.v v11, (a1)
+; NO_FOLDING-NEXT:    vmv.v.i v8, 0
+; NO_FOLDING-NEXT:    vmerge.vim v9, v8, 1, v0
+; NO_FOLDING-NEXT:    vmv.v.v v0, v10
+; NO_FOLDING-NEXT:    vmerge.vim v8, v8, 1, v0
+; NO_FOLDING-NEXT:    vadd.vv v10, v9, v8
+; NO_FOLDING-NEXT:    vsub.vv v8, v9, v8
+; NO_FOLDING-NEXT:    vmv.v.v v0, v11
+; NO_FOLDING-NEXT:    vor.vv v10, v10, v9, v0.t
 ; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
 ; NO_FOLDING-NEXT:    ret
 ;
@@ -465,16 +465,16 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetvli a3, zero, e32, m1, ta, mu
 ; FOLDING-NEXT:    vlm.v v0, (a0)
-; FOLDING-NEXT:    vlm.v v8, (a2)
-; FOLDING-NEXT:    vlm.v v9, (a1)
-; FOLDING-NEXT:    vmv.v.i v10, 0
-; FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; FOLDING-NEXT:    vmv.v.v v0, v8
-; FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; FOLDING-NEXT:    vadd.vv v10, v11, v8
-; FOLDING-NEXT:    vsub.vv v8, v11, v8
-; FOLDING-NEXT:    vmv.v.v v0, v9
-; FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT:    vlm.v v10, (a2)
+; FOLDING-NEXT:    vlm.v v11, (a1)
+; FOLDING-NEXT:    vmv.v.i v8, 0
+; FOLDING-NEXT:    vmerge.vim v9, v8, 1, v0
+; FOLDING-NEXT:    vmv.v.v v0, v10
+; FOLDING-NEXT:    vmerge.vim v8, v8, 1, v0
+; FOLDING-NEXT:    vadd.vv v10, v9, v8
+; FOLDING-NEXT:    vsub.vv v8, v9, v8
+; FOLDING-NEXT:    vmv.v.v v0, v11
+; FOLDING-NEXT:    vor.vv v10, v10, v9, v0.t
 ; FOLDING-NEXT:    vor.vv v8, v10, v8
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i1>, ptr %x
@@ -496,16 +496,16 @@ define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; NO_FOLDING:       # %bb.0:
 ; NO_FOLDING-NEXT:    vsetvli a3, zero, e8, mf4, ta, mu
 ; NO_FOLDING-NEXT:    vlm.v v0, (a0)
-; NO_FOLDING-NEXT:    vlm.v v8, (a2)
-; NO_FOLDING-NEXT:    vlm.v v9, (a1)
-; NO_FOLDING-NEXT:    vmv.v.i v10, 0
-; NO_FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; NO_FOLDING-NEXT:    vmv1r.v v0, v8
-; NO_FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; NO_FOLDING-NEXT:    vadd.vv v10, v11, v8
-; NO_FOLDING-NEXT:    vsub.vv v8, v11, v8
-; NO_FOLDING-NEXT:    vmv1r.v v0, v9
-; NO_FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT:    vlm.v v10, (a2)
+; NO_FOLDING-NEXT:    vlm.v v11, (a1)
+; NO_FOLDING-NEXT:    vmv.v.i v8, 0
+; NO_FOLDING-NEXT:    vmerge.vim v9, v8, 1, v0
+; NO_FOLDING-NEXT:    vmv1r.v v0, v10
+; NO_FOLDING-NEXT:    vmerge.vim v8, v8, 1, v0
+; NO_FOLDING-NEXT:    vadd.vv v10, v9, v8
+; NO_FOLDING-NEXT:    vsub.vv v8, v9, v8
+; NO_FOLDING-NEXT:    vmv1r.v v0, v11
+; NO_FOLDING-NEXT:    vor.vv v10, v10, v9, v0.t
 ; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
 ; NO_FOLDING-NEXT:    ret
 ;
@@ -513,16 +513,16 @@ define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetvli a3, zero, e8, mf4, ta, mu
 ; FOLDING-NEXT:    vlm.v v0, (a0)
-; FOLDING-NEXT:    vlm.v v8, (a2)
-; FOLDING-NEXT:    vlm.v v9, (a1)
-; FOLDING-NEXT:    vmv.v.i v10, 0
-; FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; FOLDING-NEXT:    vmv1r.v v0, v8
-; FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; FOLDING-NEXT:    vadd.vv v10, v11, v8
-; FOLDING-NEXT:    vsub.vv v8, v11, v8
-; FOLDING-NEXT:    vmv1r.v v0, v9
-; FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT:    vlm.v v10, (a2)
+; FOLDING-NEXT:    vlm.v v11, (a1)
+; FOLDING-NEXT:    vmv.v.i v8, 0
+; FOLDING-NEXT:    vmerge.vim v9, v8, 1, v0
+; FOLDING-NEXT:    vmv1r.v v0, v10
+; FOLDING-NEXT:    vmerge.vim v8, v8, 1, v0
+; FOLDING-NEXT:    vadd.vv v10, v9, v8
+; FOLDING-NEXT:    vsub.vv v8, v9, v8
+; FOLDING-NEXT:    vmv1r.v v0, v11
+; FOLDING-NEXT:    vor.vv v10, v10, v9, v0.t
 ; FOLDING-NEXT:    vor.vv v8, v10, v8
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i1>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
index 4457c1002acc7b..eb70f18e714553 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -211,12 +211,12 @@ define <vscale x 32 x half> @vfmerge_fv_nxv32f16(<vscale x 32 x half> %va, half
 ; CHECK-ZVFHMIN:       # %bb.0:
 ; CHECK-ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK-ZVFHMIN-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVFHMIN-NEXT:    vfmv.v.f v24, fa5
+; CHECK-ZVFHMIN-NEXT:    vfmv.v.f v16, fa5
 ; CHECK-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; CHECK-ZVFHMIN-NEXT:    vmv.v.v v20, v16
+; CHECK-ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v16
+; CHECK-ZVFHMIN-NEXT:    vmv.v.v v28, v24
 ; CHECK-ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -448,37 +448,27 @@ define <vscale x 16 x double> @vselect_combine_regression(<vscale x 16 x i64> %v
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a1)
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmseq.vi v24, v16, 0
+; CHECK-NEXT:    vmseq.vi v7, v16, 0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmseq.vi v0, v16, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -492,15 +482,16 @@ define void @vselect_legalize_regression(<vscale x 16 x double> %a, <vscale x 16
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vlm.v v24, (a0)
-; CHECK-NEXT:    vmand.mm v7, v0, v24
+; CHECK-NEXT:    vmand.mm v24, v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a2, a0, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    vslidedown.vx v0, v24, a2
+; CHECK-NEXT:    vmv1r.v v6, v24
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    vs8r.v v8, (a1)
 ; CHECK-NEXT:    slli a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 0a5e50160fbc93..0fd48ec4a2170c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -354,15 +354,11 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a1, a3, 3
 ; CHECK-NEXT:    add a1, a0, a1
@@ -373,11 +369,10 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    bltu a2, a1, .LBB27_2
@@ -385,17 +380,12 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -411,15 +401,11 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a2, a1, 3
 ; CHECK-NEXT:    add a2, a0, a2
@@ -430,11 +416,10 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a4, a1, 2
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    bltu a1, a2, .LBB28_2
@@ -442,17 +427,12 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB28_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
@@ -698,47 +678,37 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    sub a3, a2, a1
 ; CHECK-NEXT:    sltu a4, a2, a3
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a4, a1, 3
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    bltu a2, a1, .LBB48_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB48_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 834e7dd85aea06..9821f8b421741e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -151,10 +151,10 @@ declare <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8>, <
 define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscale x 32 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vsext_nxv32i8_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v13, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    vsext.vf4 v24, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 706876dc385475..c4c48fbdaf9781 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -386,10 +386,10 @@ declare <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i3
 define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vsitofp_nxv32f16_nxv32i32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v24, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    srli a2, a1, 2
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFH-NEXT:    vmv1r.v v25, v0
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    sub a2, a0, a1
@@ -403,17 +403,17 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    mv a0, a1
 ; ZVFH-NEXT:  .LBB25_2:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v24
+; ZVFH-NEXT:    vmv1r.v v0, v25
 ; ZVFH-NEXT:    vfncvt.f.x.w v24, v8, v0.t
 ; ZVFH-NEXT:    vmv8r.v v8, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vsitofp_nxv32f16_nxv32i32:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    srli a2, a1, 2
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
@@ -444,10 +444,10 @@ declare <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i
 define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsitofp_nxv32f32_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -461,7 +461,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index b56a0f40176cf3..230125ceca5a66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -590,7 +590,7 @@ declare <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8>, <vsc
 define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssub_vi_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    csrr a2, vlenb
@@ -607,7 +607,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> splat (i8 -1), <vscale x 128 x i1> %m, i32 %evl)
@@ -1392,10 +1392,10 @@ declare <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32>, <vsc
 define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssub_vi_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a1, a2, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a1, a0, a2
@@ -1410,7 +1410,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:  .LBB118_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> splat (i32 -1), <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index 8275c3081c7c16..70a4e5e8362b1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -588,7 +588,7 @@ declare <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8>, <vsc
 define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssubu_vi_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    csrr a2, vlenb
@@ -605,7 +605,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssubu.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> splat (i8 -1), <vscale x 128 x i1> %m, i32 %evl)
@@ -1390,10 +1390,10 @@ declare <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32>, <vsc
 define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssubu_vi_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a1, a2, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a1, a0, a2
@@ -1408,7 +1408,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:  .LBB118_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> splat (i32 -1), <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index 4857810e7a1703..7007b40055198b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -157,25 +157,25 @@ declare <vscale x 15 x i16> @llvm.vp.trunc.nxv15i16.nxv15i64(<vscale x 15 x i64>
 define <vscale x 15 x i16> @vtrunc_nxv15i16_nxv15i64(<vscale x 15 x i64> %a, <vscale x 15 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vtrunc_nxv15i16_nxv15i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v28, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v28, v16, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v18, v28, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v18, v24, 0, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB12_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v28
 ; CHECK-NEXT:    vnsrl.wi v20, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v20, 0, v0.t
@@ -214,10 +214,10 @@ declare <vscale x 32 x i7> @llvm.vp.trunc.nxv32i7.nxv32i32(<vscale x 32 x i32>,
 define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vscale x 32 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vtrunc_nxv32i7_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v28, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -225,15 +225,15 @@ define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v28, v16, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v18, v28, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v18, v24, 0, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB15_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB15_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v28
 ; CHECK-NEXT:    vnsrl.wi v20, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v20, 0, v0.t
@@ -248,10 +248,10 @@ declare <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32>,
 define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vscale x 32 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vtrunc_nxv32i8_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v28, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -259,15 +259,15 @@ define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v28, v16, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v18, v28, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v18, v24, 0, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB16_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB16_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v28
 ; CHECK-NEXT:    vnsrl.wi v20, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v20, 0, v0.t
@@ -288,7 +288,6 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -297,49 +296,51 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    srli a4, a1, 2
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v0, a4
-; CHECK-NEXT:    slli a4, a1, 3
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    vl8re64.v v8, (a4)
-; CHECK-NEXT:    slli a4, a1, 1
-; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    sltu a6, a2, a5
+; CHECK-NEXT:    srli a5, a1, 3
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a5
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
+; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    add a3, a0, a3
+; CHECK-NEXT:    vl8re64.v v16, (a3)
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    sub a4, a2, a3
+; CHECK-NEXT:    sltu a6, a2, a4
 ; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    sub a6, a5, a1
-; CHECK-NEXT:    sltu a7, a5, a6
+; CHECK-NEXT:    and a4, a6, a4
+; CHECK-NEXT:    sub a6, a4, a1
+; CHECK-NEXT:    sltu a7, a4, a6
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    and a6, a7, a6
-; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vslidedown.vx v0, v16, a3
-; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v20, v8, 0, v0.t
-; CHECK-NEXT:    bltu a5, a1, .LBB17_2
+; CHECK-NEXT:    and a0, a7, a6
+; CHECK-NEXT:    vmv1r.v v24, v25
+; CHECK-NEXT:    vslidedown.vx v0, v25, a5
+; CHECK-NEXT:    bltu a4, a1, .LBB17_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a5, a1
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB17_2:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v7, a3
-; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
-; CHECK-NEXT:    bltu a2, a4, .LBB17_4
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v28, v16, 0, v0.t
+; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vnsrl.wi v24, v8, 0, v0.t
+; CHECK-NEXT:    bltu a2, a3, .LBB17_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a2, a4
+; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB17_4:
 ; CHECK-NEXT:    sub a0, a2, a1
 ; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v28, v8, 0, v0.t
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vnsrl.wi v12, v16, 0, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB17_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
@@ -350,9 +351,9 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnsrl.wi v24, v8, 0, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index e083d594db25df..67082c748afbcb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -386,10 +386,10 @@ declare <vscale x 32 x half> @llvm.vp.uitofp.nxv32f16.nxv32i32(<vscale x 32 x i3
 define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vuitofp_nxv32f16_nxv32i32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vmv1r.v v24, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    srli a2, a1, 2
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFH-NEXT:    vmv1r.v v25, v0
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    sub a2, a0, a1
@@ -403,17 +403,17 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    mv a0, a1
 ; ZVFH-NEXT:  .LBB25_2:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vmv1r.v v0, v24
+; ZVFH-NEXT:    vmv1r.v v0, v25
 ; ZVFH-NEXT:    vfncvt.f.xu.w v24, v8, v0.t
 ; ZVFH-NEXT:    vmv8r.v v8, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vuitofp_nxv32f16_nxv32i32:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    srli a2, a1, 2
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
@@ -444,10 +444,10 @@ declare <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i
 define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vuitofp_nxv32f32_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -461,7 +461,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 400f89b1ef77d1..47c4e35f264fbb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -151,10 +151,10 @@ declare <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8>, <
 define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscale x 32 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vzext_nxv32i8_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv1r.v v13, v0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    vzext.vf4 v24, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    ret

>From 299c3dd9cc41183aa586bd9444095f9bdeea3c80 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Fri, 12 Apr 2024 18:36:47 +0800
Subject: [PATCH 2/4] Format and clean code

Created using spr 1.3.6-beta.1
---
 llvm/lib/Target/RISCV/RISCVFoldMasks.cpp    | 2 +-
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
index 0425e1411107e0..39f2d8444bd61e 100644
--- a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
@@ -50,7 +50,7 @@ class RISCVFoldMasks : public MachineFunctionPass {
   bool convertToUnmasked(MachineInstr &MI) const;
   bool convertVMergeToVMv(MachineInstr &MI) const;
 
-  bool isAllOnesMask(const MachineOperand& MaskOp) const;
+  bool isAllOnesMask(const MachineOperand &MaskOp) const;
 };
 
 } // namespace
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 81543768d6a646..728117323bcefe 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3425,8 +3425,7 @@ static bool usesAllOnesMask(SDValue MaskOp) {
   // TODO: Check that the VMSET is the expected bitwidth? The pseudo has
   // undefined behaviour if it's the wrong bitwidth, so we could choose to
   // assume that it's all-ones? Same applies to its VL.
-  return MaskOp->isMachineOpcode() &&
-         IsVMSet(MaskOp.getMachineOpcode());
+  return MaskOp->isMachineOpcode() && IsVMSet(MaskOp.getMachineOpcode());
 }
 
 // Return true if we can make sure mask of N is all-ones mask.
@@ -3477,11 +3476,6 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
     Ops.push_back(Op);
   }
 
-  // // Transitively apply any node glued to our new node.
-  // const auto *Glued = N->getGluedNode();
-  // if (auto *TGlued = Glued->getGluedNode())
-  //   Ops.push_back(SDValue(TGlued, TGlued->getNumValues() - 1));
-
   MachineSDNode *Result =
       CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
 

>From f36c41433468babace6750b96cd00aaa4297dccc Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Mon, 15 Apr 2024 12:14:34 +0800
Subject: [PATCH 3/4] Remove unneeded changes

Created using spr 1.3.6-beta.1
---
 llvm/lib/Target/RISCV/RISCVFoldMasks.cpp   | 1 -
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
index 39f2d8444bd61e..4c424440aa9729 100644
--- a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
@@ -123,7 +123,6 @@ bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI) const {
                                            TRI->lookThruCopyLike(FalseReg, MRI))
     return false;
 
-  // assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0);
   if (!isAllOnesMask(MI.getOperand(4)))
     return false;
 
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 66ed3f51662cac..316daf2763ca1e 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -549,7 +549,7 @@ def VRM8 : VReg<VM8VTs, (add V8M8, V16M8, V24M8, V0M8), 8>;
 
 def VRM8NoV0 : VReg<VM8VTs, (sub VRM8, V0M8), 8>;
 
-def VMV0 : VReg<VMaskVTs, (add V0, V0, V0, V0, V0), 1>;
+def VMV0 : VReg<VMaskVTs, (add V0), 1>;
 
 let RegInfos = XLenRI in {
 def GPRF16  : RISCVRegisterClass<[f16], 16, (add GPR)>;

>From 910526051250bc37cd7912a524865860d4382f85 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Mon, 15 Apr 2024 12:25:37 +0800
Subject: [PATCH 4/4] Update vmsge.ll and vmsgeu.ll

Created using spr 1.3.6-beta.1
---
 llvm/test/CodeGen/RISCV/rvv/vmsge.ll  | 114 +++++++++++---------------
 llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll | 114 +++++++++++---------------
 2 files changed, 96 insertions(+), 132 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
index 502fb9b24148f7..2b892081105837 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsge.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsge.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsge_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v10, v8
+; CHECK-NEXT:    vmsle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsge.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsge.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsge_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v12, v8
+; CHECK-NEXT:    vmsle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsge.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsge.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v10, v8
+; CHECK-NEXT:    vmsle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsge.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsge.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsge_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v12, v8
+; CHECK-NEXT:    vmsle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsge.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v10, v8
+; CHECK-NEXT:    vmsle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsge.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsge.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v12, v8
+; CHECK-NEXT:    vmsle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsge.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsle.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v10, v8
+; CHECK-NEXT:    vmsle.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsge.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsle.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsle.vv v0, v12, v8
+; CHECK-NEXT:    vmsle.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsge.nxv4i64(
@@ -1708,11 +1690,11 @@ define <vscale x 1 x i1> @intrinsic_vmsge_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsle.vv v10, v11, v8, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsle.vv v11, v10, v8, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
index 9410a99d81423f..b0ea082780d4b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i8(
 define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i8(
 define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i8(
 define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgeu.mask.nxv8i8(
 define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgeu.mask.nxv16i8(
 define <vscale x 16 x i1> @intrinsic_vmsgeu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v10, v8
+; CHECK-NEXT:    vmsleu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsgeu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsgeu.mask.nxv32i8(
 define <vscale x 32 x i1> @intrinsic_vmsgeu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v12, v8
+; CHECK-NEXT:    vmsleu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 32 x i1> @llvm.riscv.vmsgeu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i16(
 define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i16(
 define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i16(
 define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgeu.mask.nxv8i16(
 define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v10, v8
+; CHECK-NEXT:    vmsleu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsgeu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgeu.mask.nxv16i16(
 define <vscale x 16 x i1> @intrinsic_vmsgeu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v12, v8
+; CHECK-NEXT:    vmsleu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 16 x i1> @llvm.riscv.vmsgeu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i32(
 define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i32(
 define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i32(
 define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v10, v8
+; CHECK-NEXT:    vmsleu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsgeu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgeu.mask.nxv8i32(
 define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v12, v8
+; CHECK-NEXT:    vmsleu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 8 x i1> @llvm.riscv.vmsgeu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i64(
 define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i64(
 define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v14, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v14, v10, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v10, v8
+; CHECK-NEXT:    vmsleu.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v14
-; CHECK-NEXT:    vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 2 x i1> @llvm.riscv.vmsgeu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i64(
 define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v20, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v20, v12, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmsleu.vv v0, v12, v8
+; CHECK-NEXT:    vmsleu.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v20
-; CHECK-NEXT:    vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
 entry:
   %mask = call <vscale x 4 x i1> @llvm.riscv.vmsgeu.nxv4i64(
@@ -1708,11 +1690,11 @@ define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vmsleu.vv v10, v11, v8, v0.t
-; RV32-NEXT:    vmv.v.v v0, v10
+; RV32-NEXT:    vmsleu.vv v11, v10, v8, v0.t
+; RV32-NEXT:    vmv.v.v v0, v11
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
